feat: 支持绑定外部 bt_toxin 数据库 (2025-11-04 更新)

- docker_client.py: run_bttoxin_digger() 新增 bttoxin_db_dir 参数,支持挂载外部数据库
- run_single_fna_pipeline.py: 新增 --bttoxin_db_dir 参数,自动检测 external_dbs/bt_toxin
- README.md: 添加 bttoxin_db 更新说明和 Docker 绑定文档
- external_dbs/bt_toxin: 添加 2025-11-04 版本数据库文件

测试验证: HAN055 样本毒素命名版本号变化 (Cry2Aa9→22, Cry2Ab35→41, Cry1Ia40→42, Vip3Aa7→79)
This commit is contained in:
2026-01-04 14:37:49 +08:00
parent 5883e13c56
commit 1c0e8f90a5
40 changed files with 166422 additions and 194 deletions

View File

@@ -140,6 +140,86 @@ tests/output/
└── BtToxin_Digger.log
```
## bttoxin_db更新
BtToxin_Digger 容器内置的数据库版本较旧(2021年8月),建议使用官方 GitHub 仓库的最新数据库。
### 数据库目录结构
```
external_dbs/bt_toxin/
├── db/ # BLAST 索引文件(运行时必需)
│ ├── bt_toxin.phr
│ ├── bt_toxin.pin
│ ├── bt_toxin.psq
│ ├── bt_toxin.pdb
│ ├── bt_toxin.pjs
│ ├── bt_toxin.pot
│ ├── bt_toxin.ptf
│ ├── bt_toxin.pto
│ └── old/
└── seq/ # 序列源文件(留档/更新用)
├── bt_toxin20251104.fas
└── ...
```
### 更新步骤
```bash
mkdir -p external_dbs
rm -rf external_dbs/bt_toxin tmp_bttoxin_repo
git clone --filter=blob:none --no-checkout https://github.com/liaochenlanruo/BtToxin_Digger.git tmp_bttoxin_repo
cd tmp_bttoxin_repo
git sparse-checkout init --cone
git sparse-checkout set BTTCMP_db/bt_toxin
git checkout master
# 把目录拷贝到你的项目 external_dbs 下
cd ..
cp -a tmp_bttoxin_repo/BTTCMP_db/bt_toxin external_dbs/bt_toxin
# 清理临时 repo
rm -rf tmp_bttoxin_repo
```
### 验证数据库绑定
```bash
# 检查数据库文件是否完整
ls -lh external_dbs/bt_toxin/db/
# 验证容器能正确访问绑定的数据库
docker run --rm \
-v "$(pwd)/external_dbs/bt_toxin:/usr/local/bin/BTTCMP_db/bt_toxin:ro" \
quay.io/biocontainers/bttoxin_digger:1.0.10--hdfd78af_0 \
bash -lc 'ls -lh /usr/local/bin/BTTCMP_db/bt_toxin/db | head'
```
输出应显示 `.pin/.psq/.phr` 等文件,且时间戳/大小与宿主机一致,说明绑定成功。
### 使用外部数据库运行 Pipeline
脚本会自动检测 `external_dbs/bt_toxin` 目录,若存在则自动绑定:
```bash
# 自动使用 external_dbs/bt_toxin(推荐)
uv run python scripts/run_single_fna_pipeline.py --fna tests/test_data/HAN055.fna
# 或手动指定数据库路径
uv run python scripts/run_single_fna_pipeline.py \
--fna tests/test_data/HAN055.fna \
--bttoxin_db_dir /path/to/custom/bt_toxin
```
### 注意事项
- **db/ 目录是必需的**:运行时 BLAST 只读取 `db/` 下的索引文件
- **seq/ 目录是可选的**:仅用于留档或重新生成索引
- **绑定模式为只读 (ro)**:防止容器意外修改宿主机数据库
- **不需要重新 index**:GitHub 仓库已包含预构建的 BLAST 索引
## License
MIT License

View File

@@ -151,9 +151,19 @@ class DockerContainerManager:
sequence_type: str = "nucl",
scaf_suffix: str = ".fna",
threads: int = 4,
bttoxin_db_dir: Optional[Path] = None,
**kwargs: Any,
) -> Dict[str, Any]:
"""在容器中运行 BtToxin_Digger 主分析(单目录方案)。"""
"""在容器中运行 BtToxin_Digger 主分析(单目录方案)。
Args:
bttoxin_db_dir: 外部 bt_toxin 数据库目录路径(可选)。
若提供,将绑定到容器内 /usr/local/bin/BTTCMP_db/bt_toxin
覆盖容器内置的旧数据库。目录结构应为:
bt_toxin/
├── db/ (BLAST 索引文件)
└── seq/ (序列源文件)
"""
# 1) 在宿主输出目录下准备 input_files并复制输入文件
work_input_dir = (output_dir / "input_files").resolve()
@@ -239,12 +249,24 @@ class DockerContainerManager:
if kwargs.get("assemble_only"):
base_cmd.append("--assemble_only")
# 2) 挂载输出目录(含 input_files日志目录
# 2) 挂载输出目录(含 input_files日志目录、以及可选的外部数据库
volumes = {
str(output_dir.resolve()): {"bind": "/workspace", "mode": "rw"},
str(log_dir.resolve()): {"bind": "/data/logs", "mode": "rw"},
}
# 绑定外部 bt_toxin 数据库(覆盖容器内置旧库)
if bttoxin_db_dir is not None:
db_path = Path(bttoxin_db_dir).resolve()
if db_path.exists() and (db_path / "db").exists():
volumes[str(db_path)] = {
"bind": "/usr/local/bin/BTTCMP_db/bt_toxin",
"mode": "ro",
}
logger.info(f"绑定外部数据库: {db_path} -> /usr/local/bin/BTTCMP_db/bt_toxin")
else:
logger.warning(f"外部数据库目录不存在或结构不完整: {bttoxin_db_dir}")
logger.info("开始 BtToxin_Digger 分析...")
final_cmd = base_cmd

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,22 @@
{
"version": "1.2",
"dbname": "bt_toxin",
"dbtype": "Protein",
"db-version": 5,
"description": "bt_toxin20251104.fas",
"number-of-letters": 996368,
"number-of-sequences": 1199,
"last-updated": "2025-11-04T15:35:00",
"number-of-volumes": 1,
"bytes-total": 1149077,
"bytes-to-cache": 1007264,
"files": [
"bt_toxin.pdb",
"bt_toxin.phr",
"bt_toxin.pin",
"bt_toxin.pot",
"bt_toxin.psq",
"bt_toxin.ptf",
"bt_toxin.pto"
]
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,36 @@
def get_unique_headers(file_path):
    """Collect the distinct '>'-prefixed header lines found in a text file.

    Every line is stripped of surrounding whitespace first; a line that then
    starts with '>' contributes everything after that first character
    (inner spaces and any other characters are kept unchanged).

    Args:
        file_path: Path of the file to scan.

    Returns:
        A set with the header texts, without the leading '>'.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale-derived default encoding of the machine running the script.
    with open(file_path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return {text[1:] for text in stripped_lines if text.startswith(">")}
# Input and output file paths (relative paths, resolved against the current
# working directory when the script runs).
file1 = 'bt_toxin20251104.fas'
file2 = 'all_app_cry_cyt_gpp_mcf_mpf_mpp_mtx_pra_prb_spp_tpp_txp_vip_vpa_vpb_xpp_fasta_sequences.txt'
output_file = 'unique_headers.txt'

# Header sets of the two files.
headers1 = get_unique_headers(file1)
headers2 = get_unique_headers(file2)

# Headers that occur in exactly one of the two files.
unique_to_file1 = headers1 - headers2
unique_to_file2 = headers2 - headers1

# Write the report: one section per input file, headers sorted and written
# back with their leading '>'. The encoding is pinned to UTF-8 so the bytes
# written do not depend on the platform's locale.
with open(output_file, 'w', encoding='utf-8') as out_f:
    out_f.write(f"### Unique headers in {file1} ###\n")
    out_f.writelines(f">{header}\n" for header in sorted(unique_to_file1))
    out_f.write(f"\n### Unique headers in {file2} ###\n")
    out_f.writelines(f">{header}\n" for header in sorted(unique_to_file2))

print(f"处理完成,结果已保存至 {output_file}")

View File

@@ -1,12 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>BtToxin Pipeline</title>
</head>
<body>
<div id="app"></div>
<script type="module" src="/src/main.js"></script>
</body>
</html>

View File

@@ -1,22 +0,0 @@
{
"name": "bttoxin-frontend",
"version": "1.0.0",
"type": "module",
"scripts": {
"dev": "vite",
"build": "vite build",
"preview": "vite preview"
},
"dependencies": {
"vue": "^3.5.13",
"vue-router": "^4.5.0",
"pinia": "^2.3.0",
"axios": "^1.7.9",
"naive-ui": "^2.40.1",
"@vicons/ionicons5": "^0.12.0"
},
"devDependencies": {
"@vitejs/plugin-vue": "^5.2.1",
"vite": "^6.0.5"
}
}

View File

@@ -1,41 +0,0 @@
<template>
<n-config-provider :theme="darkTheme">
<n-layout>
<n-layout-header bordered>
<n-space align="center" justify="space-between" style="padding: 16px">
<n-h2>BtToxin Pipeline</n-h2>
<n-menu mode="horizontal" :options="menuOptions" />
</n-space>
</n-layout-header>
<n-layout-content style="padding: 24px">
<router-view />
</n-layout-content>
<n-layout-footer bordered style="padding: 16px; text-align: center">
BtToxin Pipeline © 2025
</n-layout-footer>
</n-layout>
</n-config-provider>
</template>
<script setup>
import { darkTheme } from 'naive-ui'
import { h } from 'vue'
import { RouterLink } from 'vue-router'
// Navigation entries as [menu key, route path, visible text]; each one is
// turned into a menu option whose label renders a RouterLink to that path.
const menuOptions = [
  ['home', '/', 'Home'],
  ['upload', '/upload', 'Upload'],
  ['jobs', '/jobs', 'Jobs']
].map(([key, to, text]) => ({
  label: () => h(RouterLink, { to }, { default: () => text }),
  key
}))
</script>

View File

@@ -1,14 +0,0 @@
import { createApp } from 'vue'
import { createPinia } from 'pinia'
import naive from 'naive-ui'
import App from './App.vue'
import router from './router'
// Create the root application, register the Pinia store, the router and
// Naive UI (in that order), then mount it on the #app element.
createApp(App)
  .use(createPinia())
  .use(router)
  .use(naive)
  .mount('#app')

View File

@@ -1,31 +0,0 @@
import { createRouter, createWebHistory } from 'vue-router'
// View loaders: each is a function wrapping a dynamic import, so the module
// is only requested when the function is called.
const loadHome = () => import('./views/Home.vue')
const loadUpload = () => import('./views/Upload.vue')
const loadJobs = () => import('./views/Jobs.vue')
const loadJobDetail = () => import('./views/JobDetail.vue')

// Route table: path, route name and the loader of the view to render.
const routes = [
  { path: '/', name: 'Home', component: loadHome },
  { path: '/upload', name: 'Upload', component: loadUpload },
  { path: '/jobs', name: 'Jobs', component: loadJobs },
  { path: '/jobs/:id', name: 'JobDetail', component: loadJobDetail }
]

// Router instance using createWebHistory() for its history.
const router = createRouter({ history: createWebHistory(), routes })

export default router

View File

@@ -1,22 +0,0 @@
import axios from 'axios'
// Shared axios instance: base URL '/api/v1', timeout 30000.
const api = axios.create({ baseURL: '/api/v1', timeout: 30000 })

// POST the given form data to /jobs/create with a multipart Content-Type header.
const createJob = (formData) =>
  api.post('/jobs/create', formData, {
    headers: { 'Content-Type': 'multipart/form-data' }
  })

// GET a single job by its id.
const getJob = (jobId) => api.get(`/jobs/${jobId}`)

// GET the progress resource of a job.
const getJobProgress = (jobId) => api.get(`/jobs/${jobId}/progress`)

export default { createJob, getJob, getJobProgress }

View File

@@ -1,7 +0,0 @@
<template>
<n-space vertical size="large">
<n-card title="Welcome to BtToxin Pipeline">
<p>Automated Bacillus thuringiensis toxin mining system</p>
</n-card>
</n-space>
</template>

View File

@@ -1,5 +0,0 @@
<template>
<n-card title="Job Details">
<p>Job ID: {{ $route.params.id }}</p>
</n-card>
</template>

View File

@@ -1,5 +0,0 @@
<template>
<n-card title="Job List">
<n-empty description="No jobs yet" />
</n-card>
</template>

View File

@@ -1,7 +0,0 @@
<template>
<n-card title="Upload Genome Files">
<n-upload multiple>
<n-button>Select Files</n-button>
</n-upload>
</n-card>
</template>

View File

@@ -1,21 +0,0 @@
import { defineConfig } from 'vite'
import vue from '@vitejs/plugin-vue'
import { fileURLToPath, URL } from 'node:url'
// Filesystem path of ./src (relative to this config file), used for the '@' alias.
const srcDir = fileURLToPath(new URL('./src', import.meta.url))

// Dev-server proxy table: the '/api' prefix targets http://localhost:8000.
const proxy = {
  '/api': { target: 'http://localhost:8000', changeOrigin: true }
}

export default defineConfig({
  plugins: [vue()],
  resolve: { alias: { '@': srcDir } },
  server: { port: 3000, proxy }
})

View File

@@ -16,14 +16,20 @@ Notes
- Digger is executed in a container (root in container); files may be owned by root on host.
We write everything into <out_root>/digger to keep permissions/locality predictable.
- This script exposes CLI flags for Shotter filters to allow strict/loose runs.
- 默认使用 external_dbs/bt_toxin 作为外部数据库(若存在),覆盖容器内置旧库。
Example
python scripts/run_single_fna_pipeline.py \
--fna tests/test_data/C15.fna \
--toxicity_csv Data/toxicity-data.csv \
--out_root runs/C15_run \
--min_identity 0.50 --min_coverage 0.60 \
python scripts/run_single_fna_pipeline.py \\
--fna tests/test_data/HAN055.fna \\
--toxicity_csv Data/toxicity-data.csv \\
--out_root runs/HAN055_run \\
--min_identity 0.50 --min_coverage 0.60 \\
--disallow_unknown_families --require_index_hit --lang zh
# 使用自定义数据库路径
python scripts/run_single_fna_pipeline.py \\
--fna tests/test_data/HAN055.fna \\
--bttoxin_db_dir /path/to/custom/bt_toxin
"""
from __future__ import annotations
@@ -73,11 +79,27 @@ def run_single_fna_pipeline(
allow_unknown_families: bool = True,
require_index_hit: bool = False,
lang: str = "zh",
bttoxin_db_dir: Path | None = None,
) -> Dict[str, Any]:
"""运行单个 fna 文件的完整 pipeline。
Args:
bttoxin_db_dir: 外部 bt_toxin 数据库目录。若为 None则自动检测
项目根目录下的 external_dbs/bt_toxin。
"""
fna_path = fna_path.resolve()
out_root = out_root.resolve()
out_root.mkdir(parents=True, exist_ok=True)
# 自动检测外部数据库
if bttoxin_db_dir is None:
default_db = Path(__file__).resolve().parents[1] / "external_dbs" / "bt_toxin"
if default_db.exists() and (default_db / "db").exists():
bttoxin_db_dir = default_db
print(f"[pipeline] 使用外部数据库: {bttoxin_db_dir}")
else:
print("[pipeline] 未找到外部数据库,将使用容器内置数据库(可能较旧)")
digger_dir = out_root / "digger"
shotter_dir = out_root / "shotter"
logs_dir = out_root / "logs"
@@ -98,6 +120,7 @@ def run_single_fna_pipeline(
sequence_type="nucl",
scaf_suffix=fna_path.suffix or ".fna",
threads=4,
bttoxin_db_dir=bttoxin_db_dir,
)
if not result.get("success"):
return {
@@ -197,6 +220,8 @@ def main() -> int:
ap.add_argument("--disallow_unknown_families", action="store_true", default=False)
ap.add_argument("--require_index_hit", action="store_true", default=False)
ap.add_argument("--lang", type=str, choices=["zh", "en"], default="zh")
ap.add_argument("--bttoxin_db_dir", type=Path, default=None,
help="外部 bt_toxin 数据库目录路径(默认自动检测 external_dbs/bt_toxin")
args = ap.parse_args()
# derive per-run default out_root using file stem
@@ -215,6 +240,7 @@ def main() -> int:
allow_unknown_families=not args.disallow_unknown_families,
require_index_hit=args.require_index_hit,
lang=args.lang,
bttoxin_db_dir=args.bttoxin_db_dir,
)
if not res.get("ok"):