add ca_distances_median
This commit is contained in:
116
analysis_pdb.py
116
analysis_pdb.py
@@ -32,6 +32,7 @@ from pymol import CmdException
|
||||
import pymol
|
||||
import logging
|
||||
from logging.handlers import RotatingFileHandler
|
||||
import numpy as np
|
||||
# 使用 BioPython 导入氨基酸缩写
|
||||
AMINO_ACIDS = set(IUPACData.protein_letters)
|
||||
|
||||
@@ -104,6 +105,7 @@ class PDBAnalyzer:
|
||||
chain_id_list: List[str] = field(init=False)
|
||||
log_file: Path = field(init=False)
|
||||
logger: logging.Logger = field(init=False)
|
||||
ca_distances: List[float] = field(init=False)
|
||||
|
||||
def __post_init__(self):
|
||||
"""
|
||||
@@ -126,6 +128,7 @@ class PDBAnalyzer:
|
||||
self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix())
|
||||
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
|
||||
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
|
||||
self.ca_distances = []
|
||||
|
||||
|
||||
def _configure_logging(self):
|
||||
@@ -155,6 +158,29 @@ class PDBAnalyzer:
|
||||
|
||||
self.logger.info("Structure and properties initialized.")
|
||||
|
||||
def calculate_distance(self, chain_id, res1, res2):
|
||||
chain = self.structure[0][chain_id] # 假设只有一个模型
|
||||
try:
|
||||
res1_ca = chain[res1]['CA']
|
||||
res2_ca = chain[res2]['CA']
|
||||
distance = res1_ca - res2_ca
|
||||
return distance
|
||||
except KeyError:
|
||||
# 如果没有CA原子,返回一个很大的值表示距离无穷大
|
||||
return float('inf')
|
||||
|
||||
def judge_missing_or_error(self,structure, chain_id, detailed_seq):
|
||||
# 这里的detailed_seq是从extract_sequences(detailed=True)获取的
|
||||
for i in range(len(detailed_seq)-1):
|
||||
current_res, next_res = detailed_seq[i][0], detailed_seq[i+1][0]
|
||||
distance = self.calculate_distance(structure, chain_id, current_res, next_res)
|
||||
# 根据距离判断和逻辑处理
|
||||
|
||||
def renumber_based_on_judgement(self, chain_id, detailed_seq):
|
||||
# 实现基于前面的判断逻辑来重新编号的代码
|
||||
...
|
||||
|
||||
|
||||
@classmethod
|
||||
def cleanATOM(cls, input_file: Path, out_file: Path = None, ext: str = ".clean.pdb") -> 'PDBAnalyzer':
|
||||
"""
|
||||
@@ -185,26 +211,48 @@ class PDBAnalyzer:
|
||||
|
||||
def check_and_log_sequence_issues(self):
|
||||
"""
|
||||
检测并记录每条链的编号问题。
|
||||
检测并记录每条链的编号问题,并计算相邻残基间的距离。
|
||||
"""
|
||||
for chain_id, detailed_seq in self.extract_sequences(detailed=True).items():
|
||||
# 检测起始编号是否为1
|
||||
if detailed_seq[0][0] != 1:
|
||||
self.logger.warning(f"Chain {chain_id} does not start with residue number 1.")
|
||||
|
||||
# 检测编号连续性和错误
|
||||
for i in range(len(detailed_seq)-1):
|
||||
diff = detailed_seq[i+1][0] - detailed_seq[i][0]
|
||||
if diff > 10:
|
||||
self.logger.warning(f"Potential numbering error in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.")
|
||||
elif diff > 1:
|
||||
self.logger.warning(f"Missing residues in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.")
|
||||
|
||||
def renumber_residues_based_on_issues(self, chains: Union[List[str], str, None] = None, return_pandas: bool = False):
|
||||
modifications = self.find_issues_and_generate_modifications(chains)
|
||||
self.batch_modify_residues(modifications)
|
||||
if return_pandas:
|
||||
return self.biodata
|
||||
# 检测编号连续性和错误,同时计算距离
|
||||
for i in range(len(detailed_seq) - 1):
|
||||
diff = detailed_seq[i + 1][0] - detailed_seq[i][0]
|
||||
# if diff > 1:
|
||||
# 计算两个相邻残基的Cα原子之间的距离
|
||||
distance = self.calculate_distance(chain_id, detailed_seq[i][0], detailed_seq[i + 1][0])
|
||||
# 如果编号相差很大,但Cα原子距离不符合预期(既不符合正常距离,也不符合预期的缺失距离)
|
||||
if diff > 10 or (distance < 3 or distance > 5 * diff):
|
||||
self.logger.warning(f"Potential numbering error or unexpected distance in chain {chain_id} between residues {detailed_seq[i][0]} ({detailed_seq[i][1]}) and {detailed_seq[i + 1][0]} ({detailed_seq[i + 1][1]}). Distance: {distance:.2f} Å")
|
||||
elif 3.3 <= distance <= 4.8 * diff:
|
||||
# 距离在合理范围内,可能是正常的残基缺失
|
||||
self.ca_distances.append(distance)
|
||||
# self.logger.info(f"Missing residues detected in chain {chain_id} between residues {detailed_seq[i][0]} ({detailed_seq[i][1]}) and {detailed_seq[i + 1][0]} ({detailed_seq[i + 1][1]}). Estimated missing residues based on distance: {round(distance / 5)}. Distance: {distance:.2f} Å")
|
||||
|
||||
@classmethod
|
||||
def renumber_residues_based_on_issues_and_clean(cls, input_file: Path, out_ext: str = ".renumbered.pdb", chains: Union[List[str], str, None] = None) -> 'PDBAnalyzer':
|
||||
"""
|
||||
先清洗PDB文件,再重新编号,并返回处理后的cls实例。
|
||||
|
||||
Args:
|
||||
input_file (Path): PDB文件路径。
|
||||
out_ext (str): 重新编号后的输出文件扩展名,默认为".renumbered.pdb"。
|
||||
chains (Union[List[str], str, None]): 可选;指定要重新编号的链。
|
||||
|
||||
Returns:
|
||||
PDBAnalyzer: 经过清洗和重新编号的PDBAnalyzer实例。
|
||||
"""
|
||||
cleaned_analyzer = cls.cleanATOM(input_file)
|
||||
modifications = cleaned_analyzer.find_issues_and_generate_modifications(chains)
|
||||
cleaned_analyzer.batch_modify_residues(modifications)
|
||||
output_file = input_file.with_suffix(out_ext)
|
||||
cleaned_analyzer.save_biodata(output_file) # 假设这个方法可以将修改后的数据写入文件
|
||||
|
||||
# 返回一个新的PDBAnalyzer实例,指向重新编号后的文件
|
||||
return cls(output_file)
|
||||
|
||||
def find_issues_and_generate_modifications(self, specified_chains=None) -> dict:
|
||||
"""
|
||||
@@ -549,6 +597,19 @@ class PDBAnalyzer:
|
||||
if not fasta_file.exists():
|
||||
self.download_fasta()
|
||||
return SeqIO.parse(fasta_file.as_posix(), "fasta")
|
||||
|
||||
@property
|
||||
def ca_distances_median(self):
|
||||
"""
|
||||
计算整个蛋白质中所有氨基酸Cα原子之间距离的中位数。
|
||||
|
||||
Returns:
|
||||
float: 所有Cα原子间距离的中位数。
|
||||
"""
|
||||
if self.ca_distances:
|
||||
return np.median(self.ca_distances)
|
||||
else:
|
||||
return None
|
||||
|
||||
@dataclass
|
||||
class PDBAlign:
|
||||
@@ -664,34 +725,7 @@ def import_and_merge_pdb_strings(pdb_strings, merged_object_name, output_file):
|
||||
# 保存合并后的对象
|
||||
cmd.save(output_file, merged_object_name)
|
||||
|
||||
'''
|
||||
# 示例: 使用biopython提取A链(将会保留HETATM)
|
||||
chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A'
|
||||
chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件
|
||||
# 示例: 使用biopandas提取A链(将不会保留HETATM)
|
||||
chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A'
|
||||
chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件
|
||||
# A链改B链, 并分割保存为单独文件
|
||||
analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
|
||||
# 分割所有的链
|
||||
split_dict = {}
|
||||
for j in analyzer.chain_id_list:
|
||||
fn = Path(f'{PDB_ID}_{j}.pdb')
|
||||
analyzer.split_chain(j).to_pdb(fn.as_posix())
|
||||
split_dict[j]=fn.read_text()
|
||||
'''
|
||||
|
||||
if __name__ == "__main__":
|
||||
# import argparse
|
||||
# parser = argparse.ArgumentParser(description="Build model by Modeller")
|
||||
# parser.add_argument("-s", "--structure", help="Structure file")
|
||||
# parser.add_argument("-o", "--outdir", help="Output directory")
|
||||
# parser.add_argument("-f", "--fasta", help="Fasta file")
|
||||
# parser.add_argument("-n", "--num_loop", help="Number of loop model")
|
||||
# parser.add_argument("-m", "--md_level", help="MD level")
|
||||
# parser.add_argument("-c", "--chain", help="Chain ID")
|
||||
# args = parser.parse_args()
|
||||
# fix_all(Path('./pdb_test1'))
|
||||
pass
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user