#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file        :main.py
@Description: : foldseek quick search for protein structure alignment
@Date        :2024/03/11 15:17:21
@Author      :lyzeng
@Email       :pylyzeng@gmail.com
@version     :1.0
'''

import subprocess
from pathlib import Path
import pandas as pd
import shutil
import attrs
import argparse
from typing import Optional

# Columns requested from Foldseek via --format-output. The same tuple is used
# as the header when the result table is read back, so the two cannot drift.
_OUTPUT_COLUMNS = (
    "query", "target", "fident", "alnlen", "mismatch", "gapopen",
    "qstart", "qend", "tstart", "tend", "evalue", "bits", "prob",
    "lddt", "alntmscore",
)

# Scratch directory name handed to `foldseek easy-search` (relative to the
# current working directory).
_TMP_DIR = "tmp"


@attrs.define
class FoldseekComparer:
    """Run `foldseek easy-search` for one query PDB against every PDB in a directory.

    Attributes:
        pdb_file: The query PDB file.
        pdb_dir: Directory whose ``*.pdb`` files are used as targets.
        output_dir: Directory receiving the raw Foldseek tables and the
            converted spreadsheets; created on construction if missing.
        output_format: ``'xlsx'`` or ``'csv'`` for the converted tables.
    """

    # The specified (query) PDB file.
    pdb_file: Path = attrs.field(converter=Path, validator=attrs.validators.instance_of(Path))
    # Directory containing the PDB files to compare against.
    pdb_dir: Path = attrs.field(converter=Path, validator=attrs.validators.instance_of(Path))
    # converter=Path added so plain strings are accepted, like the two fields above.
    output_dir: Path = attrs.field(
        factory=lambda: Path('compare_out'),
        converter=Path,
        validator=attrs.validators.instance_of(Path),
    )
    output_format: str = attrs.field(default='xlsx', validator=attrs.validators.in_(['xlsx', 'csv']))

    def __attrs_post_init__(self):
        """Create the output directory if it doesn't exist."""
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def check_foldseek_path() -> str:
        """Return the path of the ``foldseek`` executable found on PATH.

        Raises:
            FileNotFoundError: If ``foldseek`` cannot be located.
        """
        foldseek_path = shutil.which("foldseek")
        if foldseek_path is None:
            raise FileNotFoundError("Foldseek is not found in the system PATH. Please install Foldseek and add it to the PATH environment variable.")
        return foldseek_path

    def _build_command(self, foldseek_path: str, target_pdb: Path, oup: Path) -> list:
        """Build the argv list for a single `foldseek easy-search` invocation."""
        return [
            foldseek_path,
            "easy-search",
            str(self.pdb_file),
            str(target_pdb),
            str(oup),
            _TMP_DIR,
            "--format-output",
            ",".join(_OUTPUT_COLUMNS),
        ]

    def compare_with_directory(self) -> None:
        """Compare the specified PDB file with all PDB files in the specified directory.

        For every ``*.pdb`` in ``pdb_dir`` other than the query itself, run
        Foldseek and convert its tab-separated result into
        ``<query>_vs_<target>.<output_format>`` inside ``output_dir``.
        A failed Foldseek run or a missing result file is reported with
        ``print`` and the loop moves on to the next target.

        Raises:
            FileNotFoundError: If ``foldseek`` is not on PATH.
        """
        foldseek_path = self.check_foldseek_path()
        query_resolved = self.pdb_file.resolve()

        # sorted() makes the processing order reproducible; glob order is arbitrary.
        for target_pdb in sorted(self.pdb_dir.glob('*.pdb')):
            # Skip the comparison if the target PDB is the same as the specified PDB
            if target_pdb.resolve() == query_resolved:
                continue

            oup = self.output_dir / f"{self.pdb_file.stem}_vs_{target_pdb.stem}"

            # Pass an argument list with no shell, so paths containing quotes,
            # spaces or shell metacharacters are handed to Foldseek verbatim.
            result = subprocess.run(
                self._build_command(foldseek_path, target_pdb, oup),
                capture_output=True,
                text=True,
            )
            if result.returncode != 0:
                print(f"Error executing foldseek: {result.stderr}")
                continue

            if not oup.exists():
                print(f"Expected output file not found: {oup}")
                continue

            df_aln = pd.read_table(oup, names=list(_OUTPUT_COLUMNS))
            # The suffix is appended with an f-string rather than with_suffix()
            # so stems that themselves contain dots are not truncated.
            if self.output_format == 'xlsx':
                df_aln.to_excel(f"{oup}.xlsx", index=False, engine='openpyxl')
            elif self.output_format == 'csv':
                df_aln.to_csv(f"{oup}.csv", index=False)


def main(argv: Optional[list] = None) -> None:
    """Parse command-line arguments and run the comparison.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description='Compare a specified PDB file with all PDB files in a directory.')
    parser.add_argument('pdb_file', type=Path, help='Path to the specified PDB file to compare')
    parser.add_argument('pdb_dir', type=Path, help='Path to the directory containing PDB files for comparison')
    parser.add_argument('-o', '--output_dir', type=Path, default=Path('compare_out'), help='Path to the output directory (default: compare_out)')
    parser.add_argument('-f', '--format', choices=['xlsx', 'csv'], default='xlsx', help='Output file format (default: xlsx)')
    args = parser.parse_args(argv)

    comparer = FoldseekComparer(
        pdb_file=args.pdb_file,
        pdb_dir=args.pdb_dir,
        output_dir=args.output_dir,
        output_format=args.format,
    )
    comparer.compare_with_directory()


if __name__ == "__main__":
    main()