From 751f3280c947a246787a01da0e6439f13e2d6aeb Mon Sep 17 00:00:00 2001 From: hotwa Date: Thu, 11 Jan 2024 16:10:44 +0800 Subject: [PATCH] add cleanATOM --- analysis_pdb.py | 182 ++++++++++++++++++++++++++++++------------------ 1 file changed, 115 insertions(+), 67 deletions(-) diff --git a/analysis_pdb.py b/analysis_pdb.py index 9f5255a..ddcbeb7 100755 --- a/analysis_pdb.py +++ b/analysis_pdb.py @@ -9,7 +9,7 @@ @version :1.0 ''' # micromamba create -n modeller modeller biopython pymol-open-source biopandas requests -y -c conda-forge -c salilab -# modeller注册码:MODELIRANJE +# modeller注册码:MODELIRANJE (//lib/modeller-10.4/modlib/modeller/config.py) from dataclasses import dataclass, field from Bio.PDB import PDBParser from Bio.SeqUtils import seq1 @@ -23,6 +23,7 @@ from Bio import SeqIO import requests from copy import deepcopy from pymol import cmd +import os @dataclass class PDBAnalyzer: @@ -51,6 +52,31 @@ class PDBAnalyzer: self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo' self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list() + def cleanATOM(self, out_file=None, ext="_clean.pdb") -> Path: # from pyrosetta.toolbox import cleanATOM + """Extract all ATOM and TER records in a PDB file and write them to a new file. + + Args: + pdb_file (str): Path of the PDB file from which ATOM and TER records + will be extracted + out_file (str): Optional argument to specify a particular output filename. + Defaults to .clean.pdb. + ext (str): File extension to use for output file. Defaults to ".clean.pdb" + """ + pdb_file = self.path.as_posix() + # find all ATOM and TER lines + with open(pdb_file, "r") as fid: + good = [l for l in fid if l.startswith(("ATOM", "TER"))] + + # default output file to _clean.pdb + if out_file is None: + out_file = os.path.splitext(pdb_file)[0] + ext + + # write the selected records to a new file + with open(out_file, "w") as fid: + fid.writelines(good) + return Path(out_file) + + def check_continuity(self, chain, missing_char): """ Check the continuity of residues in a protein chain. @@ -370,6 +396,92 @@ def import_and_merge_pdb_strings(pdb_strings, merged_object_name, output_file): # 保存合并后的对象 cmd.save(output_file, merged_object_name) +''' +# 示例: 使用biopython提取A链(将会保留HETATM) +chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A' +chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件 +# 示例: 使用biopandas提取A链(将不会保留HETATM) +chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A' +chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件 +# A链改B链, 并分割保存为单独文件 +analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb') +# 分割所有的链 +split_dict = {} +for j in analyzer.chain_id_list: + fn = Path(f'{PDB_ID}_{j}.pdb') + analyzer.split_chain(j).to_pdb(fn.as_posix()) + split_dict[j]=fn.read_text() +''' + +def fix_all(path:Path): + pdbfiles = [i for i in path.glob('*.pdb')] + for i in pdbfiles: + PDB_file_path = i + PDB_ID = i.stem + analyzer = PDBAnalyzer(PDB_file_path) + sequences = analyzer.extract_sequences(missing_char='-') # 或者 'X', 或者 '' + print(f'Residues info for {PDB_ID}: \n',sequences) + missing_info = analyzer.extract_sequences_info() + print(f'Missing residues info for {PDB_ID}:\n {missing_info}') + # 示例: 使用biopython提取A链(将会保留HETATM) + chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A' + chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件 + # 示例: 使用biopandas提取A链(将不会保留HETATM) + chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A' + chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件 + # A链改B链, 并分割保存为单独文件 + analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb') + # 分割所有的链 + split_dict = {} + for j in analyzer.chain_id_list: + fn = Path(f'{PDB_ID}_{j}.pdb') + analyzer.split_chain(j).to_pdb(fn.as_posix()) + split_dict[j]=fn.read_text() + # 修复loop区域 + from build_modeller import PDBModeler + from modeller import ModellerError + mc_dict = {} + for mc in missing_info: + out_file = f'5sws_{mc}.pdb' + analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file + mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file + if len(mc_fasta) == 1: + mc_fasta = mc_fasta[0] + out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta') + analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file) + print(f'>{mc_fasta.description}') + print(mc_fasta.seq) + modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast') + try: + modeller_results = modeller.make_model() + except ModellerError: + print(f'Failed to build model for chain {mc}') + print(f'No loops detected in {out_fasta_file.name}') + print(f'may pdb file sequence is not correct') + continue + except Exception as e: + raise e + print(f'Model files: {[file.name for file in modeller_results]}') + # change id to original + for i in modeller_results: + manalyzer = PDBAnalyzer(i) + manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i) + if len(modeller_results) == 1: + # use pymol to align + aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb')) + pdbstr = aligner.align() + mc_dict[mc] = pdbstr + else: + print('more than one model file, please set num_loop to 1') + elif len(mc_fasta) == 0: + continue + else: + raise ValueError(f'only can fix one chain content: {mc_fasta}') + + # 使用示例 + split_dict.update(mc_dict) # 更新 split_dict + import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb') + if __name__ == "__main__": # import argparse # parser = argparse.ArgumentParser(description="Build model by Modeller") @@ -380,71 +492,7 @@ if __name__ == "__main__": # parser.add_argument("-m", "--md_level", help="MD level") # parser.add_argument("-c", "--chain", help="Chain ID") # args = parser.parse_args() - pdbfiles = [i for i in Path('../PDBfile').glob('*.pdb')] - for i in pdbfiles: - PDB_file_path = i - PDB_ID = i.stem - analyzer = PDBAnalyzer(PDB_file_path) - sequences = analyzer.extract_sequences(missing_char='-') # 或者 'X', 或者 '' - print(f'Residues info for {PDB_ID}: \n',sequences) - missing_info = analyzer.extract_sequences_info() - print(f'Missing residues info for {PDB_ID}:\n {missing_info}') - # 示例: 使用biopython提取A链(将会保留HETATM) - chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A' - chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件 - # 示例: 使用biopandas提取A链(将不会保留HETATM) - chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A' - chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件 - # A链改B链, 并分割保存为单独文件 - analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb') - # 分割所有的链 - split_dict = {} - for j in analyzer.chain_id_list: - fn = Path(f'{PDB_ID}_{j}.pdb') - analyzer.split_chain(j).to_pdb(fn.as_posix()) - split_dict[j]=fn.read_text() - # 修复loop区域 - from build_modeller import PDBModeler - from modeller import ModellerError - mc_dict = {} - for mc in missing_info: - out_file = f'5sws_{mc}.pdb' - analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file - mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file - if len(mc_fasta) == 1: - mc_fasta = mc_fasta[0] - out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta') - analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file) - print(f'>{mc_fasta.description}') - print(mc_fasta.seq) - modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast') - try: - modeller_results = modeller.make_model() - except ModellerError: - print(f'Failed to build model for chain {mc}') - print(f'No loops detected in {out_fasta_file.name}') - print(f'may pdb file sequence is not correct') - continue - except Exception as e: - raise e - print(f'Model files: {[file.name for file in modeller_results]}') - # change id to original - for i in modeller_results: - manalyzer = PDBAnalyzer(i) - manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i) - if len(modeller_results) == 1: - # use pymol to align - aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb')) - pdbstr = aligner.align() - mc_dict[mc] = pdbstr - else: - print('more than one model file, please set num_loop to 1') - elif len(mc_fasta) == 0: - continue - else: - raise ValueError(f'only can fix one chain content: {mc_fasta}') + # fix_all(Path('./pdb_test1')) + pass - # 使用示例 - split_dict.update(mc_dict) # 更新 split_dict - import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')