add cleanATOM
This commit is contained in:
182
analysis_pdb.py
182
analysis_pdb.py
@@ -9,7 +9,7 @@
|
||||
@version :1.0
|
||||
'''
|
||||
# micromamba create -n modeller modeller biopython pymol-open-source biopandas requests -y -c conda-forge -c salilab
|
||||
# modeller注册码:MODELIRANJE
|
||||
# modeller注册码:MODELIRANJE (<conda_env>//lib/modeller-10.4/modlib/modeller/config.py)
|
||||
from dataclasses import dataclass, field
|
||||
from Bio.PDB import PDBParser
|
||||
from Bio.SeqUtils import seq1
|
||||
@@ -23,6 +23,7 @@ from Bio import SeqIO
|
||||
import requests
|
||||
from copy import deepcopy
|
||||
from pymol import cmd
|
||||
import os
|
||||
|
||||
@dataclass
|
||||
class PDBAnalyzer:
|
||||
@@ -51,6 +52,31 @@ class PDBAnalyzer:
|
||||
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
|
||||
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
|
||||
|
||||
def cleanATOM(self, out_file=None, ext="_clean.pdb") -> Path: # from pyrosetta.toolbox import cleanATOM
|
||||
"""Extract all ATOM and TER records in a PDB file and write them to a new file.
|
||||
|
||||
Args:
|
||||
pdb_file (str): Path of the PDB file from which ATOM and TER records
|
||||
will be extracted
|
||||
out_file (str): Optional argument to specify a particular output filename.
|
||||
Defaults to <pdb_file>.clean.pdb.
|
||||
ext (str): File extension to use for output file. Defaults to ".clean.pdb"
|
||||
"""
|
||||
pdb_file = self.path.as_posix()
|
||||
# find all ATOM and TER lines
|
||||
with open(pdb_file, "r") as fid:
|
||||
good = [l for l in fid if l.startswith(("ATOM", "TER"))]
|
||||
|
||||
# default output file to <pdb_file>_clean.pdb
|
||||
if out_file is None:
|
||||
out_file = os.path.splitext(pdb_file)[0] + ext
|
||||
|
||||
# write the selected records to a new file
|
||||
with open(out_file, "w") as fid:
|
||||
fid.writelines(good)
|
||||
return Path(out_file)
|
||||
|
||||
|
||||
def check_continuity(self, chain, missing_char):
|
||||
"""
|
||||
Check the continuity of residues in a protein chain.
|
||||
@@ -370,6 +396,92 @@ def import_and_merge_pdb_strings(pdb_strings, merged_object_name, output_file):
|
||||
# 保存合并后的对象
|
||||
cmd.save(output_file, merged_object_name)
|
||||
|
||||
'''
|
||||
# 示例: 使用biopython提取A链(将会保留HETATM)
|
||||
chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A'
|
||||
chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件
|
||||
# 示例: 使用biopandas提取A链(将不会保留HETATM)
|
||||
chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A'
|
||||
chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件
|
||||
# A链改B链, 并分割保存为单独文件
|
||||
analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
|
||||
# 分割所有的链
|
||||
split_dict = {}
|
||||
for j in analyzer.chain_id_list:
|
||||
fn = Path(f'{PDB_ID}_{j}.pdb')
|
||||
analyzer.split_chain(j).to_pdb(fn.as_posix())
|
||||
split_dict[j]=fn.read_text()
|
||||
'''
|
||||
|
||||
def fix_all(path:Path):
|
||||
pdbfiles = [i for i in path.glob('*.pdb')]
|
||||
for i in pdbfiles:
|
||||
PDB_file_path = i
|
||||
PDB_ID = i.stem
|
||||
analyzer = PDBAnalyzer(PDB_file_path)
|
||||
sequences = analyzer.extract_sequences(missing_char='-') # 或者 'X', 或者 ''
|
||||
print(f'Residues info for {PDB_ID}: \n',sequences)
|
||||
missing_info = analyzer.extract_sequences_info()
|
||||
print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
|
||||
# 示例: 使用biopython提取A链(将会保留HETATM)
|
||||
chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A'
|
||||
chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件
|
||||
# 示例: 使用biopandas提取A链(将不会保留HETATM)
|
||||
chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A'
|
||||
chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件
|
||||
# A链改B链, 并分割保存为单独文件
|
||||
analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
|
||||
# 分割所有的链
|
||||
split_dict = {}
|
||||
for j in analyzer.chain_id_list:
|
||||
fn = Path(f'{PDB_ID}_{j}.pdb')
|
||||
analyzer.split_chain(j).to_pdb(fn.as_posix())
|
||||
split_dict[j]=fn.read_text()
|
||||
# 修复loop区域
|
||||
from build_modeller import PDBModeler
|
||||
from modeller import ModellerError
|
||||
mc_dict = {}
|
||||
for mc in missing_info:
|
||||
out_file = f'5sws_{mc}.pdb'
|
||||
analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
|
||||
mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
|
||||
if len(mc_fasta) == 1:
|
||||
mc_fasta = mc_fasta[0]
|
||||
out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
|
||||
analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
|
||||
print(f'>{mc_fasta.description}')
|
||||
print(mc_fasta.seq)
|
||||
modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
|
||||
try:
|
||||
modeller_results = modeller.make_model()
|
||||
except ModellerError:
|
||||
print(f'Failed to build model for chain {mc}')
|
||||
print(f'No loops detected in {out_fasta_file.name}')
|
||||
print(f'may pdb file sequence is not correct')
|
||||
continue
|
||||
except Exception as e:
|
||||
raise e
|
||||
print(f'Model files: {[file.name for file in modeller_results]}')
|
||||
# change id to original
|
||||
for i in modeller_results:
|
||||
manalyzer = PDBAnalyzer(i)
|
||||
manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i)
|
||||
if len(modeller_results) == 1:
|
||||
# use pymol to align
|
||||
aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb'))
|
||||
pdbstr = aligner.align()
|
||||
mc_dict[mc] = pdbstr
|
||||
else:
|
||||
print('more than one model file, please set num_loop to 1')
|
||||
elif len(mc_fasta) == 0:
|
||||
continue
|
||||
else:
|
||||
raise ValueError(f'only can fix one chain content: {mc_fasta}')
|
||||
|
||||
# 使用示例
|
||||
split_dict.update(mc_dict) # 更新 split_dict
|
||||
import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')
|
||||
|
||||
if __name__ == "__main__":
|
||||
# import argparse
|
||||
# parser = argparse.ArgumentParser(description="Build model by Modeller")
|
||||
@@ -380,71 +492,7 @@ if __name__ == "__main__":
|
||||
# parser.add_argument("-m", "--md_level", help="MD level")
|
||||
# parser.add_argument("-c", "--chain", help="Chain ID")
|
||||
# args = parser.parse_args()
|
||||
pdbfiles = [i for i in Path('../PDBfile').glob('*.pdb')]
|
||||
for i in pdbfiles:
|
||||
PDB_file_path = i
|
||||
PDB_ID = i.stem
|
||||
analyzer = PDBAnalyzer(PDB_file_path)
|
||||
sequences = analyzer.extract_sequences(missing_char='-') # 或者 'X', 或者 ''
|
||||
print(f'Residues info for {PDB_ID}: \n',sequences)
|
||||
missing_info = analyzer.extract_sequences_info()
|
||||
print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
|
||||
# 示例: 使用biopython提取A链(将会保留HETATM)
|
||||
chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A'
|
||||
chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件
|
||||
# 示例: 使用biopandas提取A链(将不会保留HETATM)
|
||||
chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A'
|
||||
chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件
|
||||
# A链改B链, 并分割保存为单独文件
|
||||
analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
|
||||
# 分割所有的链
|
||||
split_dict = {}
|
||||
for j in analyzer.chain_id_list:
|
||||
fn = Path(f'{PDB_ID}_{j}.pdb')
|
||||
analyzer.split_chain(j).to_pdb(fn.as_posix())
|
||||
split_dict[j]=fn.read_text()
|
||||
# 修复loop区域
|
||||
from build_modeller import PDBModeler
|
||||
from modeller import ModellerError
|
||||
mc_dict = {}
|
||||
for mc in missing_info:
|
||||
out_file = f'5sws_{mc}.pdb'
|
||||
analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
|
||||
mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
|
||||
if len(mc_fasta) == 1:
|
||||
mc_fasta = mc_fasta[0]
|
||||
out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
|
||||
analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
|
||||
print(f'>{mc_fasta.description}')
|
||||
print(mc_fasta.seq)
|
||||
modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
|
||||
try:
|
||||
modeller_results = modeller.make_model()
|
||||
except ModellerError:
|
||||
print(f'Failed to build model for chain {mc}')
|
||||
print(f'No loops detected in {out_fasta_file.name}')
|
||||
print(f'may pdb file sequence is not correct')
|
||||
continue
|
||||
except Exception as e:
|
||||
raise e
|
||||
print(f'Model files: {[file.name for file in modeller_results]}')
|
||||
# change id to original
|
||||
for i in modeller_results:
|
||||
manalyzer = PDBAnalyzer(i)
|
||||
manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i)
|
||||
if len(modeller_results) == 1:
|
||||
# use pymol to align
|
||||
aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb'))
|
||||
pdbstr = aligner.align()
|
||||
mc_dict[mc] = pdbstr
|
||||
else:
|
||||
print('more than one model file, please set num_loop to 1')
|
||||
elif len(mc_fasta) == 0:
|
||||
continue
|
||||
else:
|
||||
raise ValueError(f'only can fix one chain content: {mc_fasta}')
|
||||
# fix_all(Path('./pdb_test1'))
|
||||
pass
|
||||
|
||||
# 使用示例
|
||||
split_dict.update(mc_dict) # 更新 split_dict
|
||||
import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')
|
||||
|
||||
|
||||
Reference in New Issue
Block a user