update
This commit is contained in:
111
analysis_pdb.py
111
analysis_pdb.py
@@ -20,6 +20,7 @@ from biopandas.pdb import PandasPdb
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from Bio.SeqRecord import SeqRecord
|
from Bio.SeqRecord import SeqRecord
|
||||||
from Bio import SeqIO
|
from Bio import SeqIO
|
||||||
|
from Bio.Align import PairwiseAligner
|
||||||
import requests
|
import requests
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from pymol import cmd
|
from pymol import cmd
|
||||||
@@ -285,7 +286,46 @@ class PDBAnalyzer:
|
|||||||
raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")
|
raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")
|
||||||
|
|
||||||
def filter_sequences(self, chain_id: str) -> List[SeqRecord]:
|
def filter_sequences(self, chain_id: str) -> List[SeqRecord]:
|
||||||
|
"""
|
||||||
|
Filter sequences from a FASTA file based on a specific chain ID.
|
||||||
|
|
||||||
|
This function is designed to work with FASTA files containing single polypeptide chains (monomers).
|
||||||
|
It filters the sequences by matching the specified chain ID with the descriptions in the FASTA file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chain_id (str): The chain ID to be used for filtering the sequences.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List[SeqRecord]: A list of SeqRecord objects corresponding to the specified chain ID.
|
||||||
|
|
||||||
|
Note:
|
||||||
|
This method assumes that the FASTA file contains sequences of individual chains (monomers) only.
|
||||||
|
It may not work correctly if the FASTA file contains sequences from multimeric proteins (with multiple chains combined).
|
||||||
|
"""
|
||||||
return list(filter(lambda x: f"Chain {chain_id}" in x.description, self.read_fasta()))
|
return list(filter(lambda x: f"Chain {chain_id}" in x.description, self.read_fasta()))
|
||||||
|
|
||||||
|
|
||||||
|
def find_most_similar(self, input_seq: str) -> str:
|
||||||
|
"""
|
||||||
|
Find the most similar sequence in the FASTA file to the given input sequence.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_seq (str): The protein sequence to compare against sequences in the FASTA file.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The most similar sequence found in the FASTA file.
|
||||||
|
"""
|
||||||
|
aligner = PairwiseAligner()
|
||||||
|
max_score = -1
|
||||||
|
most_similar_seq = None
|
||||||
|
|
||||||
|
for record in self.read_fasta():
|
||||||
|
score = aligner.score(input_seq, str(record.seq))
|
||||||
|
if score > max_score:
|
||||||
|
max_score = score
|
||||||
|
most_similar_seq = record.seq
|
||||||
|
|
||||||
|
return most_similar_seq
|
||||||
|
|
||||||
def read_fasta(self) -> SeqRecord:
|
def read_fasta(self) -> SeqRecord:
|
||||||
fasta_file = self.pdb_file.parent / f"{self.pid}.fasta"
|
fasta_file = self.pdb_file.parent / f"{self.pid}.fasta"
|
||||||
@@ -348,7 +388,7 @@ def main(PDB_ID, PDB_file_path):
|
|||||||
for mc in missing_info:
|
for mc in missing_info:
|
||||||
out_file = f'5sws_{mc}.pdb'
|
out_file = f'5sws_{mc}.pdb'
|
||||||
analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
|
analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
|
||||||
mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
|
mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file # single polypeptide chains (monomers).
|
||||||
if len(mc_fasta) == 1:
|
if len(mc_fasta) == 1:
|
||||||
mc_fasta = mc_fasta[0]
|
mc_fasta = mc_fasta[0]
|
||||||
out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
|
out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
|
||||||
@@ -424,75 +464,6 @@ for j in analyzer.chain_id_list:
|
|||||||
split_dict[j]=fn.read_text()
|
split_dict[j]=fn.read_text()
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def fix_all(path:Path):
|
|
||||||
pdbfiles = [i for i in path.glob('*.pdb')]
|
|
||||||
for i in pdbfiles:
|
|
||||||
PDB_file_path = i
|
|
||||||
PDB_ID = i.stem
|
|
||||||
analyzer = PDBAnalyzer(PDB_file_path)
|
|
||||||
sequences = analyzer.extract_sequences(missing_char='-') # 或者 'X', 或者 ''
|
|
||||||
print(f'Residues info for {PDB_ID}: \n',sequences)
|
|
||||||
missing_info = analyzer.extract_sequences_info()
|
|
||||||
print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
|
|
||||||
# 示例: 使用biopython提取A链(将会保留HETATM)
|
|
||||||
chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A'
|
|
||||||
chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件
|
|
||||||
# 示例: 使用biopandas提取A链(将不会保留HETATM)
|
|
||||||
chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A'
|
|
||||||
chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件
|
|
||||||
# A链改B链, 并分割保存为单独文件
|
|
||||||
analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
|
|
||||||
# 分割所有的链
|
|
||||||
split_dict = {}
|
|
||||||
for j in analyzer.chain_id_list:
|
|
||||||
fn = Path(f'{PDB_ID}_{j}.pdb')
|
|
||||||
analyzer.split_chain(j).to_pdb(fn.as_posix())
|
|
||||||
split_dict[j]=fn.read_text()
|
|
||||||
# 修复loop区域
|
|
||||||
from build_modeller import PDBModeler
|
|
||||||
from modeller import ModellerError
|
|
||||||
mc_dict = {}
|
|
||||||
for mc in missing_info:
|
|
||||||
out_file = f'5sws_{mc}.pdb'
|
|
||||||
analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
|
|
||||||
mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
|
|
||||||
if len(mc_fasta) == 1:
|
|
||||||
mc_fasta = mc_fasta[0]
|
|
||||||
out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
|
|
||||||
analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
|
|
||||||
print(f'>{mc_fasta.description}')
|
|
||||||
print(mc_fasta.seq)
|
|
||||||
modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
|
|
||||||
try:
|
|
||||||
modeller_results = modeller.make_model()
|
|
||||||
except ModellerError:
|
|
||||||
print(f'Failed to build model for chain {mc}')
|
|
||||||
print(f'No loops detected in {out_fasta_file.name}')
|
|
||||||
print(f'may pdb file sequence is not correct')
|
|
||||||
continue
|
|
||||||
except Exception as e:
|
|
||||||
raise e
|
|
||||||
print(f'Model files: {[file.name for file in modeller_results]}')
|
|
||||||
# change id to original
|
|
||||||
for i in modeller_results:
|
|
||||||
manalyzer = PDBAnalyzer(i)
|
|
||||||
manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i)
|
|
||||||
if len(modeller_results) == 1:
|
|
||||||
# use pymol to align
|
|
||||||
aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb'))
|
|
||||||
pdbstr = aligner.align()
|
|
||||||
mc_dict[mc] = pdbstr
|
|
||||||
else:
|
|
||||||
print('more than one model file, please set num_loop to 1')
|
|
||||||
elif len(mc_fasta) == 0:
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
raise ValueError(f'only can fix one chain content: {mc_fasta}')
|
|
||||||
|
|
||||||
# 使用示例
|
|
||||||
split_dict.update(mc_dict) # 更新 split_dict
|
|
||||||
import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# import argparse
|
# import argparse
|
||||||
# parser = argparse.ArgumentParser(description="Build model by Modeller")
|
# parser = argparse.ArgumentParser(description="Build model by Modeller")
|
||||||
|
|||||||
Reference in New Issue
Block a user