feat: 支持绑定外部 bt_toxin 数据库 (2025-11-04 更新)

- docker_client.py: run_bttoxin_digger() 新增 bttoxin_db_dir 参数,支持挂载外部数据库
- run_single_fna_pipeline.py: 新增 --bttoxin_db_dir 参数,自动检测 external_dbs/bt_toxin
- README.md: 添加 bttoxin_db 更新说明和 Docker 绑定文档
- external_dbs/bt_toxin: 添加 2025-11-04 版本数据库文件

测试验证: HAN055 样本毒素命名版本号变化 (Cry2Aa9→22, Cry2Ab35→41, Cry1Ia40→42, Vip3Aa7→79)
This commit is contained in:
2026-01-04 14:37:49 +08:00
parent 5883e13c56
commit 1c0e8f90a5
40 changed files with 166422 additions and 194 deletions

View File

@@ -16,14 +16,20 @@ Notes
- Digger is executed in a container (root in container); files may be owned by root on host.
We write everything into <out_root>/digger to keep permissions/locality predictable.
- This script exposes CLI flags for Shotter filters to allow strict/loose runs.
- 默认使用 external_dbs/bt_toxin 作为外部数据库(若存在),覆盖容器内置旧库。
Example
python scripts/run_single_fna_pipeline.py \
--fna tests/test_data/C15.fna \
--toxicity_csv Data/toxicity-data.csv \
--out_root runs/C15_run \
--min_identity 0.50 --min_coverage 0.60 \
python scripts/run_single_fna_pipeline.py \\
--fna tests/test_data/HAN055.fna \\
--toxicity_csv Data/toxicity-data.csv \\
--out_root runs/HAN055_run \\
--min_identity 0.50 --min_coverage 0.60 \\
--disallow_unknown_families --require_index_hit --lang zh
# 使用自定义数据库路径
python scripts/run_single_fna_pipeline.py \\
--fna tests/test_data/HAN055.fna \\
--bttoxin_db_dir /path/to/custom/bt_toxin
"""
from __future__ import annotations
@@ -73,11 +79,27 @@ def run_single_fna_pipeline(
allow_unknown_families: bool = True,
require_index_hit: bool = False,
lang: str = "zh",
bttoxin_db_dir: Path | None = None,
) -> Dict[str, Any]:
"""运行单个 fna 文件的完整 pipeline。
Args:
bttoxin_db_dir: 外部 bt_toxin 数据库目录。若为 None,则自动检测
项目根目录下的 external_dbs/bt_toxin(需包含 db 子目录);
若未找到,则使用容器内置数据库。
"""
fna_path = fna_path.resolve()
out_root = out_root.resolve()
out_root.mkdir(parents=True, exist_ok=True)
# 自动检测外部数据库
if bttoxin_db_dir is None:
default_db = Path(__file__).resolve().parents[1] / "external_dbs" / "bt_toxin"
if default_db.exists() and (default_db / "db").exists():
bttoxin_db_dir = default_db
print(f"[pipeline] 使用外部数据库: {bttoxin_db_dir}")
else:
print("[pipeline] 未找到外部数据库,将使用容器内置数据库(可能较旧)")
digger_dir = out_root / "digger"
shotter_dir = out_root / "shotter"
logs_dir = out_root / "logs"
@@ -98,6 +120,7 @@ def run_single_fna_pipeline(
sequence_type="nucl",
scaf_suffix=fna_path.suffix or ".fna",
threads=4,
bttoxin_db_dir=bttoxin_db_dir,
)
if not result.get("success"):
return {
@@ -197,6 +220,8 @@ def main() -> int:
ap.add_argument("--disallow_unknown_families", action="store_true", default=False)
ap.add_argument("--require_index_hit", action="store_true", default=False)
ap.add_argument("--lang", type=str, choices=["zh", "en"], default="zh")
ap.add_argument("--bttoxin_db_dir", type=Path, default=None,
help="外部 bt_toxin 数据库目录路径(默认自动检测 external_dbs/bt_toxin")
args = ap.parse_args()
# derive per-run default out_root using file stem
@@ -215,6 +240,7 @@ def main() -> int:
allow_unknown_families=not args.disallow_unknown_families,
require_index_hit=args.require_index_hit,
lang=args.lang,
bttoxin_db_dir=args.bttoxin_db_dir,
)
if not res.get("ok"):