update

2024-01-16 15:38:51 +08:00
parent 66062bead7
commit bcf5445f04
1 changed files with 41 additions and 70 deletions
--- a/analysis_pdb.py
+++ b/analysis_pdb.py
@@ -20,6 +20,7 @@ from biopandas.pdb import PandasPdb
 from pathlib import Path
 from Bio.SeqRecord import SeqRecord
 from Bio import SeqIO
 from Bio.Align import PairwiseAligner
 import requests
 from copy import deepcopy
 from pymol import cmd
@@ -285,7 +286,46 @@ class PDBAnalyzer:
            raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")
    def filter_sequences(self, chain_id: str) -> List[SeqRecord]:
        """ 
        Filter sequences from a FASTA file based on a specific chain ID. 
        This function is designed to work with FASTA files containing single polypeptide chains (monomers). 
        It filters the sequences by matching the specified chain ID with the descriptions in the FASTA file. 
        Args:
        chain_id (str): The chain ID to be used for filtering the sequences.
        Returns:
        List[SeqRecord]: A list of SeqRecord objects corresponding to the specified chain ID.
        Note:
        This method assumes that the FASTA file contains sequences of individual chains (monomers) only. 
        It may not work correctly if the FASTA file contains sequences from multimeric proteins (with multiple chains combined).
        """
        return list(filter(lambda x: f"Chain {chain_id}" in x.description, self.read_fasta()))
    def find_most_similar(self, input_seq: str) -> str:
        """ 
        Find the most similar sequence in the FASTA file to the given input sequence.
        Args:
        input_seq (str): The protein sequence to compare against sequences in the FASTA file.
        Returns:
        str: The most similar sequence found in the FASTA file.
        """
        aligner = PairwiseAligner()
        max_score = -1
        most_similar_seq = None
        for record in self.read_fasta():
            score = aligner.score(input_seq, str(record.seq))
            if score > max_score:
                max_score = score
                most_similar_seq = record.seq
        return most_similar_seq
    def read_fasta(self) -> SeqRecord:
        fasta_file = self.pdb_file.parent / f"{self.pid}.fasta"
@@ -348,7 +388,7 @@ def main(PDB_ID, PDB_file_path):
    for mc in missing_info:
        out_file = f'5sws_{mc}.pdb'
        analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
-        mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
+        mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file # single polypeptide chains (monomers). 
        if len(mc_fasta) == 1:
            mc_fasta = mc_fasta[0]
            out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
@@ -424,75 +464,6 @@ for j in analyzer.chain_id_list:
    split_dict[j]=fn.read_text()
 '''
 def fix_all(path:Path):
    pdbfiles = [i for i in path.glob('*.pdb')]
    for i in pdbfiles:
        PDB_file_path = i
        PDB_ID = i.stem
        analyzer = PDBAnalyzer(PDB_file_path)
        sequences = analyzer.extract_sequences(missing_char='-')  # 或者 'X', 或者 ''
        print(f'Residues info for {PDB_ID}: \n',sequences)
        missing_info = analyzer.extract_sequences_info()
        print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
        # 示例: 使用biopython提取A链（将会保留HETATM）
        chain_extractor = analyzer.extract_chain('A')  # 假设要提取的链ID是 'A'
        chain_extractor.save('biopython_extracted_chain_A.pdb')  # 保存为PDB文件
        # 示例: 使用biopandas提取A链（将不会保留HETATM）
        chain_extractor = analyzer.split_chain('A')  # 假设要提取的链ID是 'A'
        chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb')  # 保存为PDB文件
        # A链改B链, 并分割保存为单独文件
        analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
        # 分割所有的链
        split_dict = {}
        for j in analyzer.chain_id_list:
            fn = Path(f'{PDB_ID}_{j}.pdb')
            analyzer.split_chain(j).to_pdb(fn.as_posix())
            split_dict[j]=fn.read_text()
        # 修复loop区域
        from build_modeller import PDBModeler
        from modeller import ModellerError
        mc_dict = {}
        for mc in missing_info:
            out_file = f'5sws_{mc}.pdb'
            analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
            mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
            if len(mc_fasta) == 1:
                mc_fasta = mc_fasta[0]
                out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
                analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
                print(f'>{mc_fasta.description}')
                print(mc_fasta.seq)
                modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
                try:
                    modeller_results = modeller.make_model()
                except ModellerError:
                    print(f'Failed to build model for chain {mc}')
                    print(f'No loops detected in {out_fasta_file.name}')
                    print(f'may pdb file sequence is not correct')
                    continue
                except Exception as e:
                    raise e
                print(f'Model files: {[file.name for file in modeller_results]}')
                # change id to original
                for i in modeller_results:
                    manalyzer = PDBAnalyzer(i)
                    manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i)
                if len(modeller_results) == 1:
                    # use pymol to align
                    aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb'))
                    pdbstr = aligner.align()
                    mc_dict[mc] = pdbstr
                else:
                    print('more than one model file, please set num_loop to 1')
            elif len(mc_fasta) == 0:
                continue
            else:
                raise ValueError(f'only can fix one chain content: {mc_fasta}')
        # 使用示例
        split_dict.update(mc_dict)  # 更新 split_dict
        import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')
 if __name__ == "__main__":
    # import argparse
    # parser = argparse.ArgumentParser(description="Build model by Modeller")