From 89ed5cfa1a20bff89255be4a68f94589e1baa9f3 Mon Sep 17 00:00:00 2001 From: zly <644706215@qq.com> Date: Tue, 13 Jan 2026 23:55:12 +0800 Subject: [PATCH] feat(crispr): reserve CRISPR-Cas module infrastructure for future implementation\n\n- Add [feature.crispr] environment to pixi.toml (commented, ready to activate)\n- Create comprehensive CRISPR implementation plan document\n- Document 4-phase implementation roadmap:\n * Phase 1: CRISPR detection using CRISPRCasFinder\n * Phase 2: Spacer-toxin gene association analysis\n * Phase 3: Integration with shoter scoring (reserved parameters)\n * Phase 4: Enhanced visualization\n- Define data structures, API contracts, and usage examples\n- Document scientific background and testing strategy\n\nStatus: RESERVED - not yet implemented, all infrastructure prepared for future activation.\n\nCo-Authored-By: Claude --- docs/CRISPR_IMPLEMENTATION_PLAN.md | 309 +++++++++++++++++++++++++++++ pixi.toml | 37 +++- 2 files changed, 343 insertions(+), 3 deletions(-) create mode 100644 docs/CRISPR_IMPLEMENTATION_PLAN.md diff --git a/docs/CRISPR_IMPLEMENTATION_PLAN.md b/docs/CRISPR_IMPLEMENTATION_PLAN.md new file mode 100644 index 0000000..ad979cf --- /dev/null +++ b/docs/CRISPR_IMPLEMENTATION_PLAN.md @@ -0,0 +1,309 @@ +# CRISPR-Cas Analysis Module - Implementation Plan + +## Overview + +This document outlines the planned implementation of CRISPR-Cas system analysis for the BtToxin Pipeline. This feature is **reserved for future development** and provides a roadmap for integrating CRISPR-Cas detection with toxin activity assessment. + +## Status: RESERVED + +All infrastructure is prepared but implementation is **not yet started**. This module will be activated when resources and requirements are finalized. 
+ +--- + +## Architecture + +### Directory Structure (to be created) + +``` +crispr_cas/ +├── scripts/ +│ ├── detect_crispr.py # CRISPR array detection +│ ├── fusion_analysis.py # Spacer-toxin gene analysis +│ └── crispr_scoring.py # Integration with shoter scoring +├── docs/ +│ ├── IMPLEMENTATION.md # Module implementation notes (this plan currently lives at docs/CRISPR_IMPLEMENTATION_PLAN.md) +│ └── API_REFERENCE.md # Module API documentation (to be created) +└── tests/ + ├── test_detect_crispr.py + └── test_fusion_analysis.py +``` + +### Data Flow + +``` +Genome (.fna) → CRISPRCasFinder → CRISPR Results (JSON) + ↓ + Fusion Analysis Module + ↓ + Toxin Genes (All_Toxins.txt) + ↓ + Enhanced Shoter Scoring + ↓ + CRISPR-Augmented Activity Scores +``` + +--- + +## Implementation Plan + +### Phase 1: CRISPR Detection + +**File:** `crispr_cas/scripts/detect_crispr.py` + +**Tool:** CRISPRCasFinder (https://crisprcas.i2bc.paris-saclay.fr/) + +**Tasks:** +1. Integrate CRISPRCasFinder CLI or implement Python wrapper +2. Parse CRISPRCasFinder output (General Case or Cas spacer) +3. Extract: + - Cas type/subtype (I-E, I-F, II-A, V-A, etc.) + - CRISPR array positions + - Spacer sequences + - Repeat sequences + - Protospacer Adjacent Motif (PAM) sequences + +**Output Format (JSON):** +```json +{ + "strain_id": { + "cas_type": "I-E", + "arrays": [ + { + "position": "contig_1:12345-12678", + "repeat": "5'-GTTTTAGAGCTATGCTGTTTTGAATGGTCCCAAAAC-3'", + "spacers": [ + {"sequence": "ATGCGTCGAC", "position": 0}, + {"sequence": "CGTAGCTAGC", "position": 37} + ] + } + ], + "summary": {"num_arrays": 3, "num_spacers": 24} + } +} +``` + +--- + +### Phase 2: Spacer-Toxin Gene Association + +**File:** `crispr_cas/scripts/fusion_analysis.py` + +**Tasks:** +1. Map CRISPR arrays to genomic positions +2. Identify toxin genes near CRISPR arrays (within a 10 kb window) +3. 
Analyze potential spacer-target matches: + - Extract toxin gene sequences + - Perform BLAST of spacers against toxin genes + - Identify potential immunity or targeting relationships + +**Output Format (JSON):** +```json +{ + "strain_id": { + "crispr_toxin_associations": [ + { + "crispr_array": "contig_1:12345-12678", + "nearby_toxins": ["Cry1Ac1", "Cry2Aa3"], + "spacer_targets": [ + {"spacer": "ATGCGTCGAC", "target": "Cry1Ac1", "identity": 0.95} + ], + "distance_to_toxin": 2500 + } + ] + } +} +``` + +--- + +### Phase 3: Integration with Shoter Scoring + +**File:** Modify `scripts/bttoxin_shoter.py` + +**Reserved Parameters (add to argument parser):** + +```python +# CRISPR-Cas Integration (Reserved for Future Implementation) +ap.add_argument("--crispr_weight", type=float, default=0.0, + help="[FUTURE] Weight for CRISPR-Cas contribution to activity scores (0-1)") +ap.add_argument("--crispr_results", type=Path, default=None, + help="[FUTURE] Path to CRISPR-Cas detection results JSON") +ap.add_argument("--crispr_fusion", action="store_true", default=False, + help="[FUTURE] Enable spacer-toxin fusion analysis") +``` + +**Scoring Integration (in `score_strain()` function):** + +```python +# Reserved: CRISPR-Cas scoring integration +# When CRISPR is enabled, modify strain scores: +# +# if args.crispr_weight > 0 and crispr_data: +# crispr_boost = calculate_crispr_activity_boost( +# strain=strain, +# crispr_data=crispr_data.get(strain, {}), +# toxin_hits=toxin_hits +# ) +# # Apply CRISPR boost to target order scores +# for order, score in sscore.scores.items(): +# sscore.scores[order] = score * (1 - args.crispr_weight) + \ +# crispr_boost.get(order, 0) * args.crispr_weight +``` + +--- + +### Phase 4: Enhanced Visualization + +**File:** `scripts/plot_shotter.py` + +**Tasks:** +1. Add CRISPR-Cas panel to existing heatmaps +2. 
Visualize: + - CRISPR array positions on genome + - Spacer-toxin targeting relationships + - CRISPR-enhanced activity scores + +**Output Format:** +- Extended PDF report with CRISPR section +- Additional JSON with CRISPR metadata +- Optional: Genomic track visualization (SVG/PNG) + +--- + +## Pixi Integration + +The pixi environment is already configured (commented out) in `pixi.toml`: + +```toml +# ========================= +# CRISPR-Cas 环境:预留用于未来的 CRISPR-Cas 分析 +# ========================= +# [feature.crispr.dependencies] +# python = ">=3.9" +# biopython = "*" +# pandas = ">=2.0.0" +# ========================= +# [feature.crispr.tasks] +# crispr-detect = "python crispr_cas/scripts/detect_crispr.py" +# crispr-fusion = "python crispr_cas/scripts/fusion_analysis.py" +``` + +**To activate the CRISPR module:** +1. Uncomment the `[feature.crispr.dependencies]` section +2. Uncomment the `[feature.crispr.tasks]` section +3. Uncomment the `crispr = ["crispr"]` entry in the environments section of `pixi.toml` +4. Run `pixi install` + +--- + +## Usage Examples (When Implemented) + +### Basic CRISPR Detection +```bash +pixi run -e crispr crispr-detect \ + --input genome.fna \ + --output crispr_results.json +``` + +### Full Pipeline with CRISPR Integration +```bash +# Run CRISPR detection first +pixi run -e crispr crispr-detect --input genome.fna --output crispr.json + +# Run pipeline with CRISPR-enhanced scoring +pixi run pipeline \ + --input genome.fna \ + --toxicity_csv Data/toxicity-data.csv \ + --crispr_results crispr.json \ + --crispr_weight 0.2 \ + --crispr_fusion +``` + +### API Integration (Future) +```http +# Backend API endpoint (to be implemented) +POST /api/v1/tasks +{ + "files": ["genome.fna"], + "crispr_enabled": true, + "crispr_weight": 0.2, + "crispr_fusion": true +} +``` + +--- + +## Scientific Background + +### Why CRISPR-Cas in Bt Analysis? + +1. **Phage Immunity**: CRISPR-Cas systems in Bt may provide immunity against phages, affecting strain fitness +2. 
**Plasmid Tracking**: CRISPR spacers can indicate plasmid content and horizontal gene transfer history +3. **Strain Differentiation**: CRISPR array patterns can distinguish closely related strains +4. **Toxin Gene Proximity**: CRISPR arrays near toxin genes may indicate genomic defense mechanisms + +### Expected Benefits + +- Enhanced strain characterization beyond toxin profiling +- Better understanding of strain evolution and adaptation +- Potential correlation with biocontrol efficacy +- Additional markers for strain selection + +--- + +## Testing Strategy + +### Unit Tests +- CRISPR detection mock data parsing +- Spacer-toxin distance calculation +- CRISPR score calculation logic + +### Integration Tests +- End-to-end pipeline with small genome +- Comparison with manual CRISPRCasFinder results +- Scoring consistency with/without CRISPR + +### Validation +- Compare CRISPR-enhanced scores with experimental bioassay data +- Validate CRISPR-toxin associations using known literature + +--- + +## Dependencies + +### External Tools +- **CRISPRCasFinder** (v4.2+): https://crisprcas.i2bc.paris-saclay.fr/ +- **BLAST+** (for spacer-toxin alignment) + +### Python Packages +- biopython >= 1.79 +- pandas >= 2.0.0 +- numpy >= 1.21.0 + +--- + +## Timeline Estimate + +- **Phase 1**: 2-3 weeks (CRISPR detection wrapper) +- **Phase 2**: 2-3 weeks (Fusion analysis) +- **Phase 3**: 1-2 weeks (Shoter integration) +- **Phase 4**: 2-3 weeks (Visualization) + +**Total**: ~2-3 months for full implementation + +--- + +## References + +1. Couvin, D. et al. (2018) CRISPRCasFinder, an update of CRISRFinder, includes a portable version, enhanced performance and integrates search for Cas proteins. *Nucleic Acids Research*, 46(W1), W246-W251. +2. Chakraborty, S. et al. (2020) CRISPR-Cas systems in Bacillus thuringiensis: diversity, evolution and potential applications. *Frontiers in Microbiology*, 11, 591. +3. 
BtToxin Pipeline Documentation: `docs/shotter_workflow.md` + +--- + +## Contact + +For questions or implementation guidance, refer to the main project documentation or create an issue in the project repository. + +**Last Updated:** 2026-01-13 +**Status:** Reserved - Implementation Pending diff --git a/pixi.toml b/pixi.toml index 16f9934..f0f4e41 100644 --- a/pixi.toml +++ b/pixi.toml @@ -44,6 +44,36 @@ python-dotenv = "*" httpx = "*" pytest = "*" +# ========================= +# CRISPR-Cas 环境:预留用于未来的 CRISPR-Cas 分析 +# ========================= +# 此环境预留用于 CRISPR-Cas 系统分析模块 +# 实现计划: +# 1. 使用 CRISPRCasFinder 或类似工具识别 CRISPR arrays +# 2. 提取 spacer 序列并与毒素基因关联分析 +# 3. 评估 CRISPR-Cas 系统对宿主防御的影响 +# +# 预期依赖(待激活时添加): +# python = ">=3.9" +# crisprcasfinder = "*" # 或使用 pyCRISPRcas +# biopython = "*" +# pandas = ">=2.0.0" +# +# 使用方式: +# pixi run -e crispr crispr-detect --input genome.fna --output crispr_results.json +# pixi run -e crispr crispr-fusion --toxins All_Toxins.txt --crispr crispr_results.json +# ========================= +# [feature.crispr.dependencies] +# # 预留依赖,实际实现时取消注释 +# python = ">=3.9" +# # crisprcasfinder = "*" # 需要配置安装源 +# biopython = "*" +# pandas = ">=2.0.0" +# ========================= +# [feature.crispr.tasks] +# crispr-detect = "python crispr_cas/scripts/detect_crispr.py" +# crispr-fusion = "python crispr_cas/scripts/fusion_analysis.py" + # ========================= # 环境定义 # ========================= @@ -52,6 +82,7 @@ digger = ["digger"] pipeline = ["pipeline"] frontend = ["frontend"] webbackend = ["webbackend"] +# crispr = ["crispr"] # 取消注释以激活 CRISPR 环境 # ========================= # pipeline tasks @@ -59,7 +90,7 @@ webbackend = ["webbackend"] [feature.pipeline.tasks] pipeline = "python scripts/run_single_fna_pipeline.py" digger-only = "python scripts/run_digger_stage.py" -shotter = "python scripts/bttoxin_shoter.py" +shoter = "python scripts/bttoxin_shoter.py" plot = "python scripts/plot_shotter.py" # ========================= @@ -76,6 +107,6 @@ 
fe-lint = { cmd = "pnpm lint", cwd = "frontend" } # webbackend tasks # ========================= [feature.webbackend.tasks] -api-dev = "uvicorn web.backend.main:app --reload --host 0.0.0.0 --port 8000" -api-test = "pytest web/backend/ -v" +api-dev = "uvicorn backend.app.main:app --reload --host 0.0.0.0 --port 8000" +api-test = "pytest backend/ -v" web-start = "bash scripts/start_web.sh"