Refactor: Unified pipeline execution, simplified UI, and fixed Docker config

- Backend: Refactored `tasks.py` to invoke `run_single_fna_pipeline.py` directly for consistency.
- Backend: Changed the output format to ZIP and added automatic cleanup of intermediate files.
- Backend: Fixed language parameter passing in the API and tasks.
- Frontend: Removed CRISPR Fusion UI elements from the Submit and Monitor views.
- Frontend: Implemented a simulated progress bar for better UX.
- Frontend: Restored the one-click load button and added documentation of the result file structure.
- Docker: Fixed a critical container restart loop by removing an incorrect `image` directive in `docker-compose.yml`.
- Docker: Optimized the Dockerfile to correct `.pixi` environment path issues and prevent accidental deletion of frontend assets.
2026-01-20 20:25:25 +08:00
parent 5067169b0b
commit c75c85c53b
134 changed files with 146457 additions and 996647 deletions
--- a/scripts/bttoxin_shoter.py
+++ b/scripts/bttoxin_shoter.py
@@ -15,6 +15,7 @@ from __future__ import annotations
 import argparse
 import json
 import re
+import math
 from dataclasses import dataclass, asdict, field
 import os
 from pathlib import Path
@@ -436,7 +437,58 @@ def partner_fulfilled_for_hit(hit_family: str, strain_df: pd.DataFrame, index: S
    return False


-def score_strain(strain: str, sdf: pd.DataFrame, index: SpecificityIndex) -> Tuple[List[ToxinHit], StrainScores, Optional[StrainSpeciesScores]]:
+def apply_logit_prior(
+    score: float,
+    bgc_data: Dict[str, int],
+    mobilome_data: Dict[str, int],
+    crispr_state: int,
+    betas: Dict[str, float]
+) -> float:
+    """Shift a toxin score by genome-context evidence in logit space.
+
+    S_final = sigmoid(logit(S_tox) + Delta), where
+    Delta = beta_z*b_z + beta_t*b_t + beta_a*b_a + beta_m*g(m) + beta_c*h(c),
+    g(m) = ln(1 + m) and h(c) = 1 - c/2 (c = 0 -> 1, 1 -> 0.5, 2 -> 0).
+
+    Scores <= 0 or >= 1 short-circuit to 0.0 / 1.0. Missing or null betas and
+    context values count as 0; m is floored at 0 and c is clamped to [0, 2],
+    so negative counts or out-of-range states cannot raise or skew h(c).
+    """
+    if score <= 0:
+        return 0.0
+    if score >= 1:
+        return 1.0
+
+    # Clamp away from 0 and 1 so the logit stays finite.
+    epsilon = 1e-6
+    p = max(epsilon, min(1.0 - epsilon, score))
+    logit_p = math.log(p / (1.0 - p))
+
+    # `or 0` maps a missing key or JSON null to 0 before the float conversion.
+    m = max(0.0, float(mobilome_data.get("mobile_elements_count") or 0))
+    g_m = math.log1p(m)
+    h_c = 1.0 - min(2.0, max(0.0, float(crispr_state or 0))) / 2.0
+    terms = (("z", bgc_data.get("ZWA")), ("t", bgc_data.get("Thu")),
+             ("a", bgc_data.get("TAA")), ("m", g_m), ("c", h_c))
+    delta = sum(float(betas.get(k) or 0.0) * float(v or 0) for k, v in terms)
+
+    # Branch on the sign so math.exp never overflows for a large |logit|.
+    final_logit = logit_p + delta
+    if final_logit >= 0:
+        return 1.0 / (1.0 + math.exp(-final_logit))
+    exp_logit = math.exp(final_logit)
+    return exp_logit / (1.0 + exp_logit)
+
+
+def score_strain(
+    strain: str,
+    sdf: pd.DataFrame,
+    index: SpecificityIndex,
+    crispr_associations: Dict[str, Any] = None,
+    crispr_weight: float = 0.0,
+    context_data: Dict[str, Any] = None,
+    betas: Dict[str, float] = None
+) -> Tuple[List[ToxinHit], StrainScores, Optional[StrainSpeciesScores]]:
    # Include special buckets per requirements
    order_set = sorted({*index.all_orders, "other", "unknown"})
    per_hit: List[ToxinHit] = []
@@ -477,6 +529,14 @@ def score_strain(strain: str, sdf: pd.DataFrame, index: SpecificityIndex) -> Tup
        if not fulfilled:
            w *= 0.2  # partner penalty if required but not present

+        # CRISPR Boost (Hit-level)
+        if crispr_associations and crispr_weight > 0:
+            # Check if this toxin is associated with CRISPR
+            # keys in crispr_associations are toxin names
+            assoc = crispr_associations.get(hit_id) or crispr_associations.get(name_key)
+            if assoc:
+                 w = min(1.0, w + crispr_weight)
+
        contribs: Dict[str, float] = {}
        for o in order_set:
            p = float(order_dist.get(o, 0.0))
@@ -523,7 +583,21 @@ def score_strain(strain: str, sdf: pd.DataFrame, index: SpecificityIndex) -> Tup
            )
        )

+    # 1. Calculate S_tox (Noisy-OR)
    order_scores = {o: (1.0 - one_minus[o]) for o in order_set}
+
+    # 2. Apply Logit Prior if context data is present
+    if context_data and betas:
+        bgc = context_data.get("bgc", {})
+        mobi = context_data.get("mobilome", {})
+        crispr_st = context_data.get("crispr_state", 0)
+
+        for o in order_scores:
+            s_tox = order_scores[o]
+            if s_tox > 0:
+                s_final = apply_logit_prior(s_tox, bgc, mobi, crispr_st, betas)
+                order_scores[o] = s_final
+
    # choose top order excluding unresolved buckets if possible
    preferred = [o for o in index.all_orders if o in order_scores]
    if not preferred:
@@ -538,6 +612,20 @@ def score_strain(strain: str, sdf: pd.DataFrame, index: SpecificityIndex) -> Tup
    species_scores_obj: Optional[StrainSpeciesScores] = None
    if species_set:
        species_scores = {sp: (1.0 - sp_one_minus[sp]) for sp in species_set}
+
+        # Apply Logit Prior to Species scores too?
+        # The math doc focuses on "Strain x Order", but presumably it applies to species too if we follow the logic.
+        # "combine evidence... then prior". Let's apply it for consistency.
+        if context_data and betas:
+            bgc = context_data.get("bgc", {})
+            mobi = context_data.get("mobilome", {})
+            crispr_st = context_data.get("crispr_state", 0)
+            for sp in species_scores:
+                s_tox = species_scores[sp]
+                if s_tox > 0:
+                    s_final = apply_logit_prior(s_tox, bgc, mobi, crispr_st, betas)
+                    species_scores[sp] = s_final
+
        if species_scores:
            top_sp = max(species_set, key=lambda sp: species_scores.get(sp, 0.0))
            top_sp_score = float(species_scores.get(top_sp, 0.0))
@@ -635,8 +723,87 @@ def main():
                    help="Keep only hits that map to a known name/subfamily/family in the specificity index")
    ap.add_argument("--min_identity", type=float, default=0.0, help="Minimum identity (0-1) to keep a hit")
    ap.add_argument("--min_coverage", type=float, default=0.0, help="Minimum coverage (0-1) to keep a hit")
+
+    # CRISPR Integration
+    ap.add_argument("--crispr_results", type=Path, default=None, help="Path to CRISPR Fusion analysis results (JSON) for hit-level boost")
+    ap.add_argument("--crispr_fusion", action="store_true", help="Use fusion analysis results for stronger evidence")
+    ap.add_argument("--crispr_weight", type=float, default=0.1, help="Weight boost for CRISPR-associated toxins (0-1)")
+
+    # Genome Context Priors (Strain-level)
+    ap.add_argument("--context_bgc", type=Path, default=None, help="Path to BGC detection results (JSON)")
+    ap.add_argument("--context_mobilome", type=Path, default=None, help="Path to Mobilome analysis results (JSON)")
+    ap.add_argument("--context_crispr", type=Path, default=None, help="Path to CRISPR detection results (JSON) for strain-level prior")
+
+    # Prior Weights (Defaults from math doc)
+    ap.add_argument("--beta_z", type=float, default=0.5, help="Weight for ZWA BGC")
+    ap.add_argument("--beta_t", type=float, default=0.5, help="Weight for Thu BGC")
+    ap.add_argument("--beta_a", type=float, default=0.5, help="Weight for TAA BGC")
+    ap.add_argument("--beta_m", type=float, default=0.5, help="Weight for Mobilome")
+    ap.add_argument("--beta_c", type=float, default=0.5, help="Weight for CRISPR state")
+
    args = ap.parse_args()

+    # Load CRISPR data if available
+    crispr_associations = {}
+    if args.crispr_results and args.crispr_results.exists():
+        try:
+            with open(args.crispr_results) as f:
+                cdata = json.load(f)
+                # If fusion analysis results (has 'associations')
+                if "associations" in cdata:
+                    for assoc in cdata["associations"]:
+                         toxin_name = assoc.get("toxin")
+                         if toxin_name:
+                             # Normalize name if possible or keep as is.
+                             # Digger outputs might have variants, but fusion usually uses specific names.
+                             crispr_associations[toxin_name] = assoc
+                print(f"[Shoter] Loaded {len(crispr_associations)} CRISPR associations")
+        except Exception as e:
+            print(f"[Shoter] Failed to load CRISPR results: {e}")
+
+    # Load Prior Data
+    context_data = {
+        "bgc": {},
+        "mobilome": {},
+        "crispr_state": 0
+    }
+
+    # BGC
+    if args.context_bgc and args.context_bgc.exists():
+        try:
+            with open(args.context_bgc) as f:
+                context_data["bgc"] = json.load(f)
+            print(f"[Shoter] Loaded BGC context: {context_data['bgc']}")
+        except Exception as e:
+            print(f"[Shoter] Failed to load BGC context: {e}")
+
+    # Mobilome
+    if args.context_mobilome and args.context_mobilome.exists():
+        try:
+            with open(args.context_mobilome) as f:
+                context_data["mobilome"] = json.load(f)
+            print(f"[Shoter] Loaded Mobilome context: {context_data['mobilome']}")
+        except Exception as e:
+            print(f"[Shoter] Failed to load Mobilome context: {e}")
+
+    # CRISPR State
+    if args.context_crispr and args.context_crispr.exists():
+        try:
+            with open(args.context_crispr) as f:
+                cdata = json.load(f)
+                context_data["crispr_state"] = cdata.get("crispr_state", 0)
+            print(f"[Shoter] Loaded CRISPR state: {context_data['crispr_state']}")
+        except Exception as e:
+            print(f"[Shoter] Failed to load CRISPR context: {e}")
+
+    betas = {
+        "z": args.beta_z,
+        "t": args.beta_t,
+        "a": args.beta_a,
+        "m": args.beta_m,
+        "c": args.beta_c
+    }
+
    index = SpecificityIndex.from_csv(args.toxicity_csv)

    df = parse_all_toxins(args.all_toxins)
@@ -672,7 +839,13 @@ def main():

    for strain in strains:
        sdf = df[df["Strain"].astype(str).eq(strain)].copy()
-        per_hit, sscore, sspecies = score_strain(strain, sdf, index)
+        per_hit, sscore, sspecies = score_strain(
+            strain, sdf, index,
+            crispr_associations=crispr_associations,
+            crispr_weight=args.crispr_weight if args.crispr_fusion else 0.0,
+            context_data=context_data,
+            betas=betas
+        )
        all_hits.extend(per_hit)
        all_strain_scores.append(sscore)
        if sspecies is not None: