Refactor: Unified pipeline execution, simplified UI, and fixed Docker config

- Backend: Refactored tasks.py to directly invoke run_single_fna_pipeline.py for consistency.
- Backend: Changed output format to ZIP and added auto-cleanup of intermediate files.
- Backend: Fixed language parameter passing in API and tasks.
- Frontend: Removed CRISPR Fusion UI elements from Submit and Monitor views.
- Frontend: Implemented simulated progress bar for better UX.
- Frontend: Restored One-click load button and added result file structure documentation.
- Docker: Fixed a critical container restart loop by removing the incorrect `image` directive from docker-compose.yml.
- Docker: Optimized Dockerfile to correct .pixi environment path issues and prevent accidental deletion of frontend assets.
This commit is contained in:
zly
2026-01-20 20:25:25 +08:00
parent 5067169b0b
commit c75c85c53b
134 changed files with 146457 additions and 996647 deletions

View File

@@ -15,6 +15,7 @@ from __future__ import annotations
import argparse
import json
import re
import math
from dataclasses import dataclass, asdict, field
import os
from pathlib import Path
@@ -436,7 +437,58 @@ def partner_fulfilled_for_hit(hit_family: str, strain_df: pd.DataFrame, index: S
return False
def score_strain(strain: str, sdf: pd.DataFrame, index: SpecificityIndex) -> Tuple[List[ToxinHit], StrainScores, Optional[StrainSpeciesScores]]:
def apply_logit_prior(
    score: float,
    bgc_data: Dict[str, int],
    mobilome_data: Dict[str, int],
    crispr_state: int,
    betas: Dict[str, float]
) -> float:
    """Adjust a toxicity score with a genome-context prior in logit space.

    Computes ``S_final = sigmoid(logit(S_tox) + delta)`` where::

        delta = beta_z*b_z + beta_t*b_t + beta_a*b_a + beta_m*g(m) + beta_c*h(c)

    with ``g(m) = ln(1 + m)`` and ``h(c) = 1 - c/2``.

    Args:
        score: Base score S_tox. Values ``<= 0`` return 0.0 and values
            ``>= 1`` return 1.0 without applying the prior.
        bgc_data: BGC indicators read from the keys ``"ZWA"``, ``"Thu"`` and
            ``"TAA"``; a missing key counts as 0.
        mobilome_data: Read for ``"mobile_elements_count"``; a missing key
            counts as 0 and a negative count is treated as 0.
        crispr_state: CRISPR state ``c``; expected to be 0, 1 or 2, giving
            ``h(c)`` of 1, 0.5 or 0 respectively.
        betas: Prior weights keyed by ``"z"``, ``"t"``, ``"a"``, ``"m"`` and
            ``"c"``. All five keys are required.

    Returns:
        The prior-adjusted score, between 0.0 and 1.0 inclusive.

    Raises:
        KeyError: If ``betas`` lacks one of the five weight keys.
    """
    if score <= 0:
        return 0.0
    if score >= 1:
        return 1.0

    # Keep p strictly inside (0, 1) so the logit stays finite.
    epsilon = 1e-6
    p = max(epsilon, min(1.0 - epsilon, score))
    logit_p = math.log(p / (1.0 - p))

    # BGC indicators
    b_z = bgc_data.get("ZWA", 0)
    b_t = bgc_data.get("Thu", 0)
    b_a = bgc_data.get("TAA", 0)

    # Mobilome: g(m) = ln(1 + m). A negative count (malformed input) would put
    # the logarithm outside its domain, so it is clamped to zero first.
    m = max(0, mobilome_data.get("mobile_elements_count", 0))
    g_m = math.log1p(m)

    # CRISPR: h(c) = 1 - c/2 (0 -> 1, 1 -> 0.5, 2 -> 0)
    h_c = 1.0 - (crispr_state / 2.0)

    delta = (
        betas["z"] * b_z
        + betas["t"] * b_t
        + betas["a"] * b_a
        + betas["m"] * g_m
        + betas["c"] * h_c
    )
    final_logit = logit_p + delta

    # Sign-split sigmoid: math.exp only ever receives a non-positive argument,
    # so a large |final_logit| saturates to 0.0 / 1.0 instead of raising
    # OverflowError.
    if final_logit >= 0:
        return 1.0 / (1.0 + math.exp(-final_logit))
    exp_pos = math.exp(final_logit)
    return exp_pos / (1.0 + exp_pos)
def score_strain(
strain: str,
sdf: pd.DataFrame,
index: SpecificityIndex,
crispr_associations: Dict[str, Any] = None,
crispr_weight: float = 0.0,
context_data: Dict[str, Any] = None,
betas: Dict[str, float] = None
) -> Tuple[List[ToxinHit], StrainScores, Optional[StrainSpeciesScores]]:
# Include special buckets per requirements
order_set = sorted({*index.all_orders, "other", "unknown"})
per_hit: List[ToxinHit] = []
@@ -477,6 +529,14 @@ def score_strain(strain: str, sdf: pd.DataFrame, index: SpecificityIndex) -> Tup
if not fulfilled:
w *= 0.2 # partner penalty if required but not present
# CRISPR Boost (Hit-level)
if crispr_associations and crispr_weight > 0:
# Check if this toxin is associated with CRISPR
# keys in crispr_associations are toxin names
assoc = crispr_associations.get(hit_id) or crispr_associations.get(name_key)
if assoc:
w = min(1.0, w + crispr_weight)
contribs: Dict[str, float] = {}
for o in order_set:
p = float(order_dist.get(o, 0.0))
@@ -523,7 +583,21 @@ def score_strain(strain: str, sdf: pd.DataFrame, index: SpecificityIndex) -> Tup
)
)
# 1. Calculate S_tox (Noisy-OR)
order_scores = {o: (1.0 - one_minus[o]) for o in order_set}
# 2. Apply Logit Prior if context data is present
if context_data and betas:
bgc = context_data.get("bgc", {})
mobi = context_data.get("mobilome", {})
crispr_st = context_data.get("crispr_state", 0)
for o in order_scores:
s_tox = order_scores[o]
if s_tox > 0:
s_final = apply_logit_prior(s_tox, bgc, mobi, crispr_st, betas)
order_scores[o] = s_final
# choose top order excluding unresolved buckets if possible
preferred = [o for o in index.all_orders if o in order_scores]
if not preferred:
@@ -538,6 +612,20 @@ def score_strain(strain: str, sdf: pd.DataFrame, index: SpecificityIndex) -> Tup
species_scores_obj: Optional[StrainSpeciesScores] = None
if species_set:
species_scores = {sp: (1.0 - sp_one_minus[sp]) for sp in species_set}
# NOTE(review): The math doc only defines the logit prior for "Strain x Order" scores.
# The same prior is applied to species scores here for consistency (combine the
# evidence first, then apply the prior) - confirm that this is intended.
if context_data and betas:
bgc = context_data.get("bgc", {})
mobi = context_data.get("mobilome", {})
crispr_st = context_data.get("crispr_state", 0)
for sp in species_scores:
s_tox = species_scores[sp]
if s_tox > 0:
s_final = apply_logit_prior(s_tox, bgc, mobi, crispr_st, betas)
species_scores[sp] = s_final
if species_scores:
top_sp = max(species_set, key=lambda sp: species_scores.get(sp, 0.0))
top_sp_score = float(species_scores.get(top_sp, 0.0))
@@ -635,8 +723,87 @@ def main():
help="Keep only hits that map to a known name/subfamily/family in the specificity index")
ap.add_argument("--min_identity", type=float, default=0.0, help="Minimum identity (0-1) to keep a hit")
ap.add_argument("--min_coverage", type=float, default=0.0, help="Minimum coverage (0-1) to keep a hit")
# CRISPR Integration
ap.add_argument("--crispr_results", type=Path, default=None, help="Path to CRISPR Fusion analysis results (JSON) for hit-level boost")
ap.add_argument("--crispr_fusion", action="store_true", help="Use fusion analysis results for stronger evidence")
ap.add_argument("--crispr_weight", type=float, default=0.1, help="Weight boost for CRISPR-associated toxins (0-1)")
# Genome Context Priors (Strain-level)
ap.add_argument("--context_bgc", type=Path, default=None, help="Path to BGC detection results (JSON)")
ap.add_argument("--context_mobilome", type=Path, default=None, help="Path to Mobilome analysis results (JSON)")
ap.add_argument("--context_crispr", type=Path, default=None, help="Path to CRISPR detection results (JSON) for strain-level prior")
# Prior Weights (Defaults from math doc)
ap.add_argument("--beta_z", type=float, default=0.5, help="Weight for ZWA BGC")
ap.add_argument("--beta_t", type=float, default=0.5, help="Weight for Thu BGC")
ap.add_argument("--beta_a", type=float, default=0.5, help="Weight for TAA BGC")
ap.add_argument("--beta_m", type=float, default=0.5, help="Weight for Mobilome")
ap.add_argument("--beta_c", type=float, default=0.5, help="Weight for CRISPR state")
args = ap.parse_args()
# Load CRISPR data if available
crispr_associations = {}
if args.crispr_results and args.crispr_results.exists():
try:
with open(args.crispr_results) as f:
cdata = json.load(f)
# If fusion analysis results (has 'associations')
if "associations" in cdata:
for assoc in cdata["associations"]:
toxin_name = assoc.get("toxin")
if toxin_name:
# Normalize name if possible or keep as is.
# Digger outputs might have variants, but fusion usually uses specific names.
crispr_associations[toxin_name] = assoc
print(f"[Shoter] Loaded {len(crispr_associations)} CRISPR associations")
except Exception as e:
print(f"[Shoter] Failed to load CRISPR results: {e}")
# Load Prior Data
context_data = {
"bgc": {},
"mobilome": {},
"crispr_state": 0
}
# BGC
if args.context_bgc and args.context_bgc.exists():
try:
with open(args.context_bgc) as f:
context_data["bgc"] = json.load(f)
print(f"[Shoter] Loaded BGC context: {context_data['bgc']}")
except Exception as e:
print(f"[Shoter] Failed to load BGC context: {e}")
# Mobilome
if args.context_mobilome and args.context_mobilome.exists():
try:
with open(args.context_mobilome) as f:
context_data["mobilome"] = json.load(f)
print(f"[Shoter] Loaded Mobilome context: {context_data['mobilome']}")
except Exception as e:
print(f"[Shoter] Failed to load Mobilome context: {e}")
# CRISPR State
if args.context_crispr and args.context_crispr.exists():
try:
with open(args.context_crispr) as f:
cdata = json.load(f)
context_data["crispr_state"] = cdata.get("crispr_state", 0)
print(f"[Shoter] Loaded CRISPR state: {context_data['crispr_state']}")
except Exception as e:
print(f"[Shoter] Failed to load CRISPR context: {e}")
betas = {
"z": args.beta_z,
"t": args.beta_t,
"a": args.beta_a,
"m": args.beta_m,
"c": args.beta_c
}
index = SpecificityIndex.from_csv(args.toxicity_csv)
df = parse_all_toxins(args.all_toxins)
@@ -672,7 +839,13 @@ def main():
for strain in strains:
sdf = df[df["Strain"].astype(str).eq(strain)].copy()
per_hit, sscore, sspecies = score_strain(strain, sdf, index)
per_hit, sscore, sspecies = score_strain(
strain, sdf, index,
crispr_associations=crispr_associations,
crispr_weight=args.crispr_weight if args.crispr_fusion else 0.0,
context_data=context_data,
betas=betas
)
all_hits.extend(per_hit)
all_strain_scores.append(sscore)
if sspecies is not None: