add ca_distances_median

This commit is contained in:
2024-03-07 16:28:20 +08:00
parent eca3f45bd9
commit b66761ad9b

View File

@@ -32,6 +32,7 @@ from pymol import CmdException
import pymol
import logging
from logging.handlers import RotatingFileHandler
import numpy as np
# 使用 BioPython 导入氨基酸缩写
AMINO_ACIDS = set(IUPACData.protein_letters)
@@ -104,6 +105,7 @@ class PDBAnalyzer:
chain_id_list: List[str] = field(init=False)
log_file: Path = field(init=False)
logger: logging.Logger = field(init=False)
ca_distances: List[float] = field(init=False)
def __post_init__(self):
"""
@@ -126,6 +128,7 @@ class PDBAnalyzer:
self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix())
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
self.ca_distances = []
def _configure_logging(self):
@@ -155,6 +158,29 @@ class PDBAnalyzer:
self.logger.info("Structure and properties initialized.")
def calculate_distance(self, chain_id, res1, res2):
chain = self.structure[0][chain_id] # 假设只有一个模型
try:
res1_ca = chain[res1]['CA']
res2_ca = chain[res2]['CA']
distance = res1_ca - res2_ca
return distance
except KeyError:
# 如果没有CA原子返回一个很大的值表示距离无穷大
return float('inf')
def judge_missing_or_error(self,structure, chain_id, detailed_seq):
# 这里的detailed_seq是从extract_sequences(detailed=True)获取的
for i in range(len(detailed_seq)-1):
current_res, next_res = detailed_seq[i][0], detailed_seq[i+1][0]
distance = self.calculate_distance(structure, chain_id, current_res, next_res)
# 根据距离判断和逻辑处理
def renumber_based_on_judgement(self, chain_id, detailed_seq):
# 实现基于前面的判断逻辑来重新编号的代码
...
@classmethod
def cleanATOM(cls, input_file: Path, out_file: Path = None, ext: str = ".clean.pdb") -> 'PDBAnalyzer':
"""
@@ -185,26 +211,48 @@ class PDBAnalyzer:
def check_and_log_sequence_issues(self):
"""
检测并记录每条链的编号问题。
检测并记录每条链的编号问题,并计算相邻残基间的距离
"""
for chain_id, detailed_seq in self.extract_sequences(detailed=True).items():
# 检测起始编号是否为1
if detailed_seq[0][0] != 1:
self.logger.warning(f"Chain {chain_id} does not start with residue number 1.")
# 检测编号连续性和错误
for i in range(len(detailed_seq)-1):
diff = detailed_seq[i+1][0] - detailed_seq[i][0]
if diff > 10:
self.logger.warning(f"Potential numbering error in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.")
elif diff > 1:
self.logger.warning(f"Missing residues in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.")
def renumber_residues_based_on_issues(self, chains: Union[List[str], str, None] = None, return_pandas: bool = False):
modifications = self.find_issues_and_generate_modifications(chains)
self.batch_modify_residues(modifications)
if return_pandas:
return self.biodata
# 检测编号连续性和错误,同时计算距离
for i in range(len(detailed_seq) - 1):
diff = detailed_seq[i + 1][0] - detailed_seq[i][0]
# if diff > 1:
# 计算两个相邻残基的Cα原子之间的距离
distance = self.calculate_distance(chain_id, detailed_seq[i][0], detailed_seq[i + 1][0])
# 如果编号相差很大但Cα原子距离不符合预期既不符合正常距离也不符合预期的缺失距离
if diff > 10 or (distance < 3 or distance > 5 * diff):
self.logger.warning(f"Potential numbering error or unexpected distance in chain {chain_id} between residues {detailed_seq[i][0]} ({detailed_seq[i][1]}) and {detailed_seq[i + 1][0]} ({detailed_seq[i + 1][1]}). Distance: {distance:.2f} Å")
elif 3.3 <= distance <= 4.8 * diff:
# 距离在合理范围内,可能是正常的残基缺失
self.ca_distances.append(distance)
# self.logger.info(f"Missing residues detected in chain {chain_id} between residues {detailed_seq[i][0]} ({detailed_seq[i][1]}) and {detailed_seq[i + 1][0]} ({detailed_seq[i + 1][1]}). Estimated missing residues based on distance: {round(distance / 5)}. Distance: {distance:.2f} Å")
@classmethod
def renumber_residues_based_on_issues_and_clean(cls, input_file: Path, out_ext: str = ".renumbered.pdb", chains: Union[List[str], str, None] = None) -> 'PDBAnalyzer':
"""
先清洗PDB文件再重新编号并返回处理后的cls实例。
Args:
input_file (Path): PDB文件路径。
out_ext (str): 重新编号后的输出文件扩展名,默认为".renumbered.pdb"
chains (Union[List[str], str, None]): 可选;指定要重新编号的链。
Returns:
PDBAnalyzer: 经过清洗和重新编号的PDBAnalyzer实例。
"""
cleaned_analyzer = cls.cleanATOM(input_file)
modifications = cleaned_analyzer.find_issues_and_generate_modifications(chains)
cleaned_analyzer.batch_modify_residues(modifications)
output_file = input_file.with_suffix(out_ext)
cleaned_analyzer.save_biodata(output_file) # 假设这个方法可以将修改后的数据写入文件
# 返回一个新的PDBAnalyzer实例指向重新编号后的文件
return cls(output_file)
def find_issues_and_generate_modifications(self, specified_chains=None) -> dict:
"""
@@ -549,6 +597,19 @@ class PDBAnalyzer:
if not fasta_file.exists():
self.download_fasta()
return SeqIO.parse(fasta_file.as_posix(), "fasta")
@property
def ca_distances_median(self):
"""
计算整个蛋白质中所有氨基酸Cα原子之间距离的中位数。
Returns:
float: 所有Cα原子间距离的中位数。
"""
if self.ca_distances:
return np.median(self.ca_distances)
else:
return None
@dataclass
class PDBAlign:
@@ -664,34 +725,7 @@ def import_and_merge_pdb_strings(pdb_strings, merged_object_name, output_file):
# 保存合并后的对象
cmd.save(output_file, merged_object_name)
'''
# 示例: 使用biopython提取A链将会保留HETATM
chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A'
chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件
# 示例: 使用biopandas提取A链将不会保留HETATM
chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A'
chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件
# A链改B链, 并分割保存为单独文件
analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
# 分割所有的链
split_dict = {}
for j in analyzer.chain_id_list:
fn = Path(f'{PDB_ID}_{j}.pdb')
analyzer.split_chain(j).to_pdb(fn.as_posix())
split_dict[j]=fn.read_text()
'''
if __name__ == "__main__":
# import argparse
# parser = argparse.ArgumentParser(description="Build model by Modeller")
# parser.add_argument("-s", "--structure", help="Structure file")
# parser.add_argument("-o", "--outdir", help="Output directory")
# parser.add_argument("-f", "--fasta", help="Fasta file")
# parser.add_argument("-n", "--num_loop", help="Number of loop model")
# parser.add_argument("-m", "--md_level", help="MD level")
# parser.add_argument("-c", "--chain", help="Chain ID")
# args = parser.parse_args()
# fix_all(Path('./pdb_test1'))
pass