diff --git a/analysis_pdb.py b/analysis_pdb.py index ace84b7..37aa257 100755 --- a/analysis_pdb.py +++ b/analysis_pdb.py @@ -20,6 +20,7 @@ from biopandas.pdb import PandasPdb from pathlib import Path from Bio.SeqRecord import SeqRecord from Bio import SeqIO +from Bio.Align import PairwiseAligner import requests from copy import deepcopy from pymol import cmd @@ -285,7 +286,46 @@ class PDBAnalyzer: raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}") def filter_sequences(self, chain_id: str) -> List[SeqRecord]: + """ + Filter sequences from a FASTA file based on a specific chain ID. + + This function is designed to work with FASTA files containing single polypeptide chains (monomers). + It filters the sequences by matching the specified chain ID with the descriptions in the FASTA file. + + Args: + chain_id (str): The chain ID to be used for filtering the sequences. + + Returns: + List[SeqRecord]: A list of SeqRecord objects corresponding to the specified chain ID. + + Note: + This method assumes that the FASTA file contains sequences of individual chains (monomers) only. + It may not work correctly if the FASTA file contains sequences from multimeric proteins (with multiple chains combined). + """ return list(filter(lambda x: f"Chain {chain_id}" in x.description, self.read_fasta())) + + + def find_most_similar(self, input_seq: str) -> str: + """ + Find the most similar sequence in the FASTA file to the given input sequence. + + Args: + input_seq (str): The protein sequence to compare against sequences in the FASTA file. + + Returns: + str: The most similar sequence found in the FASTA file. + """ + aligner = PairwiseAligner() + max_score = -1 + most_similar_seq = None + + for record in self.read_fasta(): + score = aligner.score(input_seq, str(record.seq)) + if score > max_score: + max_score = score + most_similar_seq = record.seq + + return most_similar_seq def read_fasta(self) -> SeqRecord: fasta_file = self.pdb_file.parent / f"{self.pid}.fasta" @@ -348,7 +388,7 @@ def main(PDB_ID, PDB_file_path): for mc in missing_info: out_file = f'5sws_{mc}.pdb' analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file - mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file + mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file # single polypeptide chains (monomers). if len(mc_fasta) == 1: mc_fasta = mc_fasta[0] out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta') @@ -424,75 +464,6 @@ for j in analyzer.chain_id_list: split_dict[j]=fn.read_text() ''' -def fix_all(path:Path): - pdbfiles = [i for i in path.glob('*.pdb')] - for i in pdbfiles: - PDB_file_path = i - PDB_ID = i.stem - analyzer = PDBAnalyzer(PDB_file_path) - sequences = analyzer.extract_sequences(missing_char='-') # 或者 'X', 或者 '' - print(f'Residues info for {PDB_ID}: \n',sequences) - missing_info = analyzer.extract_sequences_info() - print(f'Missing residues info for {PDB_ID}:\n {missing_info}') - # 示例: 使用biopython提取A链(将会保留HETATM) - chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A' - chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件 - # 示例: 使用biopandas提取A链(将不会保留HETATM) - chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A' - chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件 - # A链改B链, 并分割保存为单独文件 - analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb') - # 分割所有的链 - split_dict = {} - for j in analyzer.chain_id_list: - fn = Path(f'{PDB_ID}_{j}.pdb') - analyzer.split_chain(j).to_pdb(fn.as_posix()) - split_dict[j]=fn.read_text() - # 修复loop区域 - from build_modeller import PDBModeler - from modeller import ModellerError - mc_dict = {} - for mc in missing_info: - out_file = f'5sws_{mc}.pdb' - analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file - mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file - if len(mc_fasta) == 1: - mc_fasta = mc_fasta[0] - out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta') - analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file) - print(f'>{mc_fasta.description}') - print(mc_fasta.seq) - modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast') - try: - modeller_results = modeller.make_model() - except ModellerError: - print(f'Failed to build model for chain {mc}') - print(f'No loops detected in {out_fasta_file.name}') - print(f'may pdb file sequence is not correct') - continue - except Exception as e: - raise e - print(f'Model files: {[file.name for file in modeller_results]}') - # change id to original - for i in modeller_results: - manalyzer = PDBAnalyzer(i) - manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i) - if len(modeller_results) == 1: - # use pymol to align - aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb')) - pdbstr = aligner.align() - mc_dict[mc] = pdbstr - else: - print('more than one model file, please set num_loop to 1') - elif len(mc_fasta) == 0: - continue - else: - raise ValueError(f'only can fix one chain content: {mc_fasta}') - - # 使用示例 - split_dict.update(mc_dict) # 更新 split_dict - import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb') - if __name__ == "__main__": # import argparse # parser = argparse.ArgumentParser(description="Build model by Modeller")