Files
bttoxin-pipeline/scripts/download_bpprc_data.py

57 lines
1.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""Download test data from BPPRC/NCBI.

Fetches the nucleotide records listed in ``TEST_GENOMES`` through NCBI
Entrez and stores each one as a FASTA (``.fna``) file.

NOTE(review): only NCBI Entrez requests are visible in this file; no
BPPRC-specific download appears here -- confirm whether the title is stale.
"""
import os
import argparse
from pathlib import Path
from Bio import Entrez, SeqIO
import logging
# Configure the root logger so INFO-level progress messages are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Placeholder contact address; main() overwrites it with the --email value
# before any request is made.
Entrez.email = "your_email@example.com"
# Maps the output file stem (strain name) to the accession that is passed
# to Entrez.efetch.
TEST_GENOMES = {
'Bacillus_thuringiensis_HD-73': 'NZ_CP004069.1',
'Bacillus_thuringiensis_YBT-1520': 'NZ_CP003889.1',
'Bacillus_thuringiensis_BMB171': 'NC_014171.1',
}
def download_genome(accession, output_file):
    """Download one nucleotide record from NCBI and save it as FASTA.

    The record is requested with ``Entrez.efetch`` (``db="nucleotide"``,
    ``rettype="fasta"``, ``retmode="text"``) and written to *output_file*.
    The response is read completely before the destination is opened, so a
    failed or empty fetch never truncates an existing file.

    Args:
        accession: Identifier passed to ``Entrez.efetch`` as ``id``.
        output_file: Destination path (``str`` or ``os.PathLike``); an
            existing file is overwritten on success.

    Returns:
        ``True`` if the record was fetched and written, ``False`` if any
        step failed. Failures are logged and never propagated.
    """
    # Broad catch is deliberate: this is a best-effort boundary that reports
    # every failure (network, HTTP, filesystem) through the return value.
    try:
        logger.info(f"Downloading {accession}...")
        handle = Entrez.efetch(
            db="nucleotide",
            id=accession,
            rettype="fasta",
            retmode="text",
        )
        # Always release the connection, even if reading the body fails.
        try:
            data = handle.read()
        finally:
            handle.close()

        # An empty body would otherwise be saved and reported as a success.
        if not data.strip():
            raise ValueError(f"empty response for {accession}")

        # Explicit encoding keeps the output independent of the locale.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(data)
    except Exception as e:
        logger.error(f"✗ Failed: {e}")
        return False

    logger.info(f"✓ Downloaded: {output_file}")
    return True
def main():
    """Download every genome listed in ``TEST_GENOMES``.

    Command-line options:
        --output-dir: Directory that receives the ``<name>.fna`` files
            (default ``tests/test_data/genomes``); created if missing.
        --email: Required; assigned to ``Entrez.email`` before any request.

    Every genome is attempted even if an earlier one fails.

    Returns:
        Process exit status: ``0`` when every download succeeded, ``1``
        when at least one failed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--output-dir', default='tests/test_data/genomes')
    parser.add_argument('--email', required=True)
    args = parser.parse_args()

    Entrez.email = args.email

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Remember which downloads failed so the result is not silently lost.
    failed = []
    for name, accession in TEST_GENOMES.items():
        output_file = output_dir / f"{name}.fna"
        if not download_genome(accession, output_file):
            failed.append(name)

    if failed:
        logger.error(
            "%d of %d downloads failed: %s",
            len(failed),
            len(TEST_GENOMES),
            ", ".join(failed),
        )
        return 1
    return 0


if __name__ == '__main__':
    # Propagate the status so shells and CI can detect failed downloads.
    raise SystemExit(main())