This commit is contained in:
root
2024-01-16 15:38:51 +08:00
parent 66062bead7
commit bcf5445f04

View File

@@ -20,6 +20,7 @@ from biopandas.pdb import PandasPdb
from pathlib import Path
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from Bio.Align import PairwiseAligner
import requests
from copy import deepcopy
from pymol import cmd
@@ -285,7 +286,46 @@ class PDBAnalyzer:
raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")
def filter_sequences(self, chain_id: str) -> List[SeqRecord]:
"""
Filter sequences from a FASTA file based on a specific chain ID.
This function is designed to work with FASTA files containing single polypeptide chains (monomers).
It filters the sequences by matching the specified chain ID with the descriptions in the FASTA file.
Args:
chain_id (str): The chain ID to be used for filtering the sequences.
Returns:
List[SeqRecord]: A list of SeqRecord objects corresponding to the specified chain ID.
Note:
This method assumes that the FASTA file contains sequences of individual chains (monomers) only.
It may not work correctly if the FASTA file contains sequences from multimeric proteins (with multiple chains combined).
"""
return list(filter(lambda x: f"Chain {chain_id}" in x.description, self.read_fasta()))
def find_most_similar(self, input_seq: str) -> str:
"""
Find the most similar sequence in the FASTA file to the given input sequence.
Args:
input_seq (str): The protein sequence to compare against sequences in the FASTA file.
Returns:
str: The most similar sequence found in the FASTA file.
"""
aligner = PairwiseAligner()
max_score = -1
most_similar_seq = None
for record in self.read_fasta():
score = aligner.score(input_seq, str(record.seq))
if score > max_score:
max_score = score
most_similar_seq = record.seq
return most_similar_seq
def read_fasta(self) -> SeqRecord:
fasta_file = self.pdb_file.parent / f"{self.pid}.fasta"
@@ -348,7 +388,7 @@ def main(PDB_ID, PDB_file_path):
for mc in missing_info:
out_file = f'5sws_{mc}.pdb'
analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file # single polypeptide chains (monomers).
if len(mc_fasta) == 1:
mc_fasta = mc_fasta[0]
out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
@@ -424,75 +464,6 @@ for j in analyzer.chain_id_list:
split_dict[j]=fn.read_text()
'''
def fix_all(path:Path):
pdbfiles = [i for i in path.glob('*.pdb')]
for i in pdbfiles:
PDB_file_path = i
PDB_ID = i.stem
analyzer = PDBAnalyzer(PDB_file_path)
sequences = analyzer.extract_sequences(missing_char='-') # 或者 'X', 或者 ''
print(f'Residues info for {PDB_ID}: \n',sequences)
missing_info = analyzer.extract_sequences_info()
print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
# 示例: 使用biopython提取A链将会保留HETATM
chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A'
chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件
# 示例: 使用biopandas提取A链将不会保留HETATM
chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A'
chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件
# A链改B链, 并分割保存为单独文件
analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
# 分割所有的链
split_dict = {}
for j in analyzer.chain_id_list:
fn = Path(f'{PDB_ID}_{j}.pdb')
analyzer.split_chain(j).to_pdb(fn.as_posix())
split_dict[j]=fn.read_text()
# 修复loop区域
from build_modeller import PDBModeler
from modeller import ModellerError
mc_dict = {}
for mc in missing_info:
out_file = f'5sws_{mc}.pdb'
analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
if len(mc_fasta) == 1:
mc_fasta = mc_fasta[0]
out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
print(f'>{mc_fasta.description}')
print(mc_fasta.seq)
modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
try:
modeller_results = modeller.make_model()
except ModellerError:
print(f'Failed to build model for chain {mc}')
print(f'No loops detected in {out_fasta_file.name}')
print(f'may pdb file sequence is not correct')
continue
except Exception as e:
raise e
print(f'Model files: {[file.name for file in modeller_results]}')
# change id to original
for i in modeller_results:
manalyzer = PDBAnalyzer(i)
manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i)
if len(modeller_results) == 1:
# use pymol to align
aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb'))
pdbstr = aligner.align()
mc_dict[mc] = pdbstr
else:
print('more than one model file, please set num_loop to 1')
elif len(mc_fasta) == 0:
continue
else:
raise ValueError(f'only can fix one chain content: {mc_fasta}')
# 使用示例
split_dict.update(mc_dict) # 更新 split_dict
import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')
if __name__ == "__main__":
# import argparse
# parser = argparse.ArgumentParser(description="Build model by Modeller")