Refactor: Unified pipeline execution, simplified UI, and fixed Docker config

- Backend: Refactored tasks.py to directly invoke run_single_fna_pipeline.py for consistency. - Backend: Changed output format to ZIP and added auto-cleanup of intermediate files. - Backend: Fixed language parameter passing in API and tasks. - Frontend: Removed CRISPR Fusion UI elements from Submit and Monitor views. - Frontend: Implemented simulated progress bar for better UX. - Frontend: Restored One-click load button and added result file structure documentation. - Docker: Fixed critical Restarting loop by removing incorrect image directive in docker-compose.yml. - Docker: Optimized Dockerfile to correct .pixi environment path issues and prevent accidental deletion of frontend assets.
2026-01-20 20:25:25 +08:00
parent 5067169b0b
commit c75c85c53b
134 changed files with 146457 additions and 996647 deletions
--- a/tools/crispr_cas_analysis/scripts/init.py
+++ b/tools/crispr_cas_analysis/scripts/init.py
@@ -0,0 +1 @@
+"""Scripts for CRISPR-Cas detection and analysis"""
--- a/tools/crispr_cas_analysis/scripts/detect_crispr.py
+++ b/tools/crispr_cas_analysis/scripts/detect_crispr.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""
+CRISPR-Cas Detection Wrapper
+Wrapper for CRISPRCasFinder or similar tools to detect CRISPR arrays and Cas genes.
+"""
+
+import argparse
+import json
+import logging
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+from typing import Dict, List, Any
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Detect CRISPR arrays and Cas genes in genome")
+    parser.add_argument("--input", "-i", type=Path, required=True, help="Input genome file (.fna)")
+    parser.add_argument("--output", "-o", type=Path, required=True, help="Output JSON results file")
+    parser.add_argument("--tool-path", type=Path, default=None, help="Path to CRISPRCasFinder.pl")
+    parser.add_argument("--mock", action="store_true", help="Use mock data (for testing without external tools)")
+    return parser.parse_args()
+
+def check_dependencies(tool_path: Path = None) -> bool:
+    """Check if CRISPRCasFinder is available"""
+    if tool_path and tool_path.exists():
+        return True
+
+    # Check in PATH
+    if shutil.which("CRISPRCasFinder.pl"):
+        return True
+
+    return False
+
+def generate_mock_results(genome_file: Path) -> Dict[str, Any]:
+    """Generate mock CRISPR results for testing"""
+    logger.info(f"Generating mock CRISPR results for {genome_file.name}")
+
+    strain_id = genome_file.stem
+
+    return {
+        "strain_id": strain_id,
+        "cas_systems": [
+            {
+                "type": "I-E",
+                "subtype": "I-E",
+                "position": "contig_1:15000-25000",
+                "genes": ["cas1", "cas2", "cas3", "casA", "casB", "casC", "casD", "casE"]
+            }
+        ],
+        "arrays": [
+            {
+                "id": "CRISPR_1",
+                "contig": "contig_1",
+                "start": 12345,
+                "end": 12678,
+                "consensus_repeat": "GTTTTAGAGCTATGCTGTTTTGAATGGTCCCAAAAC",
+                "num_spacers": 5,
+                "spacers": [
+                    {"sequence": "ATGCGTCGACATGCGTCGACATGCGTCGAC", "position": 1},
+                    {"sequence": "CGTAGCTAGCCGTAGCTAGCCGTAGCTAGC", "position": 2},
+                    {"sequence": "TGCATGCATGTGCATGCATGTGCATGCATG", "position": 3},
+                    {"sequence": "GCTAGCTAGCGCTAGCTAGCGCTAGCTAGC", "position": 4},
+                    {"sequence": "AAAAATTTTTAAAAATTTTTAAAAATTTTT", "position": 5}
+                ]
+            },
+            {
+                "id": "CRISPR_2",
+                "contig": "contig_2",
+                "start": 50000,
+                "end": 50500,
+                "consensus_repeat": "GTTTTAGAGCTATGCTGTTTTGAATGGTCCCAAAAC",
+                "num_spacers": 8,
+                "spacers": [
+                    {"sequence": "CCCGGGAAACCCGGGAAACCCGGGAAA", "position": 1}
+                ]
+            }
+        ],
+        "summary": {
+            "has_cas": True,
+            "has_crispr": True,
+            "num_arrays": 2,
+            "num_spacers": 13,
+            "cas_types": ["I-E"]
+        },
+        "metadata": {
+            "tool": "CRISPRCasFinder",
+            "version": "Mock-v1.0",
+            "date": "2025-01-14"
+        }
+    }
+
+def run_crisprcasfinder(input_file: Path, output_file: Path, tool_path: Path = None):
+    """Run actual CRISPRCasFinder tool (Placeholder)"""
+    # This would implement the actual subprocess call to CRISPRCasFinder.pl
+    # For now, we raise NotImplementedError unless mock is used
+    raise NotImplementedError("Real tool integration not yet implemented. Use --mock flag.")
+
+def main():
+    args = parse_args()
+
+    if not args.input.exists():
+        logger.error(f"Input file not found: {args.input}")
+        sys.exit(1)
+
+    # Create parent directory for output if needed
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+
+    try:
+        if args.mock:
+            results = generate_mock_results(args.input)
+        else:
+            if not check_dependencies(args.tool_path):
+                logger.warning("CRISPRCasFinder not found. Falling back to mock data.")
+                results = generate_mock_results(args.input)
+            else:
+                # Real implementation would go here
+                run_crisprcasfinder(args.input, args.output, args.tool_path)
+                return
+
+        # Write results
+        with open(args.output, 'w') as f:
+            json.dump(results, f, indent=2)
+
+        logger.info(f"Results written to {args.output}")
+
+    except Exception as e:
+        logger.error(f"Error executing CRISPR detection: {e}")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
--- a/tools/crispr_cas_analysis/scripts/fusion_analysis.py
+++ b/tools/crispr_cas_analysis/scripts/fusion_analysis.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+CRISPR-Toxin Fusion Analysis
+Analyzes associations between CRISPR spacers and toxin genes.
+"""
+
+import argparse
+import json
+import logging
+import sys
+from pathlib import Path
+from typing import Dict, List, Any
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Analyze CRISPR-Toxin associations")
+    parser.add_argument("--crispr-results", type=Path, required=True, help="CRISPR detection results (JSON)")
+    parser.add_argument("--toxin-results", type=Path, required=True, help="Toxin detection results (JSON or TXT)")
+    parser.add_argument("--genome", type=Path, required=True, help="Original genome file (.fna)")
+    parser.add_argument("--output", "-o", type=Path, required=True, help="Output analysis JSON")
+    parser.add_argument("--mock", action="store_true", help="Use mock analysis logic")
+    return parser.parse_args()
+
+def load_json(path: Path) -> Dict:
+    with open(path) as f:
+        return json.load(f)
+
+def calculate_distance(range1: str, range2: str) -> int:
+    """
+    Calculate distance between two genomic ranges.
+    Format: 'contig:start-end'
+    """
+    try:
+        contig1, coords1 = range1.split(':')
+        start1, end1 = map(int, coords1.split('-'))
+
+        contig2, coords2 = range2.split(':')
+        start2, end2 = map(int, coords2.split('-'))
+
+        if contig1 != contig2:
+            return -1 # Different contigs
+
+        # Check for overlap
+        if max(start1, start2) <= min(end1, end2):
+            return 0
+
+        # Calculate distance
+        if start1 > end2:
+            return start1 - end2
+        else:
+            return start2 - end1
+    except Exception as e:
+        logger.warning(f"Error calculating distance: {e}")
+        return -1
+
+def mock_blast_spacers(spacers: List[str], toxins: List[Dict]) -> List[Dict]:
+    """Mock BLAST spacers against toxins"""
+    matches = []
+    # Simulate a match if 'Cry' is in the spacer name (just for demo logic) or random
+    # In reality, we'd blast sequences.
+
+    # Let's just create a fake match for the first spacer
+    if spacers and toxins:
+        matches.append({
+            "spacer_seq": spacers[0],
+            "target_toxin": toxins[0].get("name", "Unknown"),
+            "identity": 98.5,
+            "alignment_length": 32,
+            "mismatches": 1
+        })
+    return matches
+
+def perform_fusion_analysis(crispr_data: Dict, toxin_file: Path, mock: bool = False) -> Dict:
+    """
+    Main analysis logic.
+    1. Map CRISPR arrays
+    2. Map Toxin genes
+    3. Calculate distances
+    4. Check for spacer matches
+    """
+
+    analysis_results = {
+        "strain_id": crispr_data.get("strain_id"),
+        "associations": [],
+        "summary": {"proximal_pairs": 0, "spacer_matches": 0}
+    }
+
+    # Extract arrays
+    arrays = crispr_data.get("arrays", [])
+
+    # Mock Toxin Parsing (assuming simple list for now if not JSON)
+    toxins = []
+    if mock:
+        toxins = [
+            {"name": "Cry1Ac1", "position": "contig_1:10000-12000"},
+            {"name": "Vip3Aa1", "position": "contig_2:60000-62000"}
+        ]
+    else:
+        # TODO: Implement real toxin file parsing (e.g. from All_Toxins.txt)
+        logger.warning("Real toxin parsing not implemented yet, using empty list")
+
+    # Analyze Proximity
+    for array in arrays:
+        array_pos = f"{array.get('contig')}:{array.get('start')}-{array.get('end')}"
+
+        for toxin in toxins:
+            dist = calculate_distance(array_pos, toxin["position"])
+
+            if dist != -1 and dist < 10000: # 10kb window
+                association = {
+                    "type": "proximity",
+                    "array_id": array.get("id"),
+                    "toxin": toxin["name"],
+                    "distance": dist,
+                    "array_position": array_pos,
+                    "toxin_position": toxin["position"]
+                }
+                analysis_results["associations"].append(association)
+                analysis_results["summary"]["proximal_pairs"] += 1
+
+    # Analyze Spacer Matches (Mock)
+    all_spacers = []
+    for array in arrays:
+        for spacer in array.get("spacers", []):
+            all_spacers.append(spacer.get("sequence"))
+
+    matches = mock_blast_spacers(all_spacers, toxins)
+    for match in matches:
+        analysis_results["associations"].append({
+            "type": "spacer_match",
+            **match
+        })
+        analysis_results["summary"]["spacer_matches"] += 1
+
+    return analysis_results
+
+def main():
+    args = parse_args()
+
+    if not args.crispr_results.exists():
+        logger.error(f"CRISPR results file not found: {args.crispr_results}")
+        sys.exit(1)
+
+    try:
+        crispr_data = load_json(args.crispr_results)
+
+        results = perform_fusion_analysis(crispr_data, args.toxin_results, args.mock)
+
+        # Write results
+        with open(args.output, 'w') as f:
+            json.dump(results, f, indent=2)
+
+        logger.info(f"Fusion analysis complete. Results: {args.output}")
+
+    except Exception as e:
+        logger.error(f"Error during fusion analysis: {e}")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
				`@@ -0,0 +1 @@`
				`"""Scripts for CRISPR-Cas detection and analysis"""`