update

2024-01-16 15:38:51 +08:00
parent 66062bead7
commit bcf5445f04
1 changed files with 41 additions and 70 deletions
--- a/analysis_pdb.py
+++ b/analysis_pdb.py
@@ -20,6 +20,7 @@ from biopandas.pdb import PandasPdb
 from pathlib import Path
 from Bio.SeqRecord import SeqRecord
 from Bio import SeqIO
+from Bio.Align import PairwiseAligner
 import requests
 from copy import deepcopy
 from pymol import cmd
@@ -285,7 +286,46 @@ class PDBAnalyzer:
            raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")

    def filter_sequences(self, chain_id: str) -> List[SeqRecord]:
+        """Return the FASTA records that belong to one chain.
+
+        A record from ``read_fasta()`` is kept when its description contains the
+        literal text ``"Chain <chain_id>"``; this is a plain substring test.
+
+        Args:
+            chain_id: Chain identifier to look for in the record descriptions.
+
+        Returns:
+            The matching ``SeqRecord`` objects (an empty list when none match).
+
+        Note:
+            Intended for FASTA files whose entries each describe a single chain
+            (monomers). Entries that combine several chains of a multimeric
+            protein may not be matched correctly.
+        """
        return list(filter(lambda x: f"Chain {chain_id}" in x.description, self.read_fasta()))
+
+    
+    def find_most_similar(self, input_seq: str) -> str:
+        """Return the FASTA sequence that aligns best with ``input_seq``.
+
+        Each record from ``read_fasta()`` is scored with a default
+        ``PairwiseAligner``; on ties the earliest record is kept.
+
+        Args:
+            input_seq: Protein sequence to compare against the FASTA records.
+
+        Returns:
+            The best-scoring sequence as ``str``; ``None`` if there are no records.
+        """
+        aligner = PairwiseAligner()
+        best_score = float("-inf")  # scores may be negative if gaps/mismatches are penalised
+        most_similar_seq = None
+        for record in self.read_fasta():
+            candidate = str(record.seq)  # plain str, matching the annotated return type
+            score = aligner.score(input_seq, candidate)
+            if score > best_score:
+                best_score, most_similar_seq = score, candidate
+        return most_similar_seq
    
    def read_fasta(self) -> SeqRecord:
        fasta_file = self.pdb_file.parent / f"{self.pid}.fasta"
@@ -348,7 +388,7 @@ def main(PDB_ID, PDB_file_path):
    for mc in missing_info:
        out_file = f'5sws_{mc}.pdb'
        analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
-        mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
+        mc_fasta = analyzer.filter_sequences(mc)  # FASTA record(s) for this chain; expects single-chain (monomer) entries
        if len(mc_fasta) == 1:
            mc_fasta = mc_fasta[0]
            out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
@@ -424,75 +464,6 @@ for j in analyzer.chain_id_list:
    split_dict[j]=fn.read_text()
 '''

-def fix_all(path:Path):
-    pdbfiles = [i for i in path.glob('*.pdb')]
-    for i in pdbfiles:
-        PDB_file_path = i
-        PDB_ID = i.stem
-        analyzer = PDBAnalyzer(PDB_file_path)
-        sequences = analyzer.extract_sequences(missing_char='-')  # 或者 'X', 或者 ''
-        print(f'Residues info for {PDB_ID}: \n',sequences)
-        missing_info = analyzer.extract_sequences_info()
-        print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
-        # 示例: 使用biopython提取A链（将会保留HETATM）
-        chain_extractor = analyzer.extract_chain('A')  # 假设要提取的链ID是 'A'
-        chain_extractor.save('biopython_extracted_chain_A.pdb')  # 保存为PDB文件
-        # 示例: 使用biopandas提取A链（将不会保留HETATM）
-        chain_extractor = analyzer.split_chain('A')  # 假设要提取的链ID是 'A'
-        chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb')  # 保存为PDB文件
-        # A链改B链, 并分割保存为单独文件
-        analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
-        # 分割所有的链
-        split_dict = {}
-        for j in analyzer.chain_id_list:
-            fn = Path(f'{PDB_ID}_{j}.pdb')
-            analyzer.split_chain(j).to_pdb(fn.as_posix())
-            split_dict[j]=fn.read_text()
-        # 修复loop区域
-        from build_modeller import PDBModeler
-        from modeller import ModellerError
-        mc_dict = {}
-        for mc in missing_info:
-            out_file = f'5sws_{mc}.pdb'
-            analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
-            mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
-            if len(mc_fasta) == 1:
-                mc_fasta = mc_fasta[0]
-                out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
-                analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
-                print(f'>{mc_fasta.description}')
-                print(mc_fasta.seq)
-                modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
-                try:
-                    modeller_results = modeller.make_model()
-                except ModellerError:
-                    print(f'Failed to build model for chain {mc}')
-                    print(f'No loops detected in {out_fasta_file.name}')
-                    print(f'may pdb file sequence is not correct')
-                    continue
-                except Exception as e:
-                    raise e
-                print(f'Model files: {[file.name for file in modeller_results]}')
-                # change id to original
-                for i in modeller_results:
-                    manalyzer = PDBAnalyzer(i)
-                    manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i)
-                if len(modeller_results) == 1:
-                    # use pymol to align
-                    aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb'))
-                    pdbstr = aligner.align()
-                    mc_dict[mc] = pdbstr
-                else:
-                    print('more than one model file, please set num_loop to 1')
-            elif len(mc_fasta) == 0:
-                continue
-            else:
-                raise ValueError(f'only can fix one chain content: {mc_fasta}')
-
-        # 使用示例
-        split_dict.update(mc_dict)  # 更新 split_dict
-        import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')
-
 if __name__ == "__main__":
    # import argparse
    # parser = argparse.ArgumentParser(description="Build model by Modeller")