diff --git a/analysis_pdb.py b/analysis_pdb.py index 382dbe6..854f06b 100755 --- a/analysis_pdb.py +++ b/analysis_pdb.py @@ -32,6 +32,7 @@ from pymol import CmdException import pymol import logging from logging.handlers import RotatingFileHandler +import numpy as np # 使用 BioPython 导入氨基酸缩写 AMINO_ACIDS = set(IUPACData.protein_letters) @@ -104,6 +105,7 @@ class PDBAnalyzer: chain_id_list: List[str] = field(init=False) log_file: Path = field(init=False) logger: logging.Logger = field(init=False) + ca_distances: List[float] = field(init=False) def __post_init__(self): """ @@ -126,6 +128,7 @@ class PDBAnalyzer: self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix()) self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo' self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list() + self.ca_distances = [] def _configure_logging(self): @@ -155,6 +158,29 @@ class PDBAnalyzer: self.logger.info("Structure and properties initialized.") + def calculate_distance(self, chain_id, res1, res2): + chain = self.structure[0][chain_id] # 假设只有一个模型 + try: + res1_ca = chain[res1]['CA'] + res2_ca = chain[res2]['CA'] + distance = res1_ca - res2_ca + return distance + except KeyError: + # 如果没有CA原子,返回一个很大的值表示距离无穷大 + return float('inf') + + def judge_missing_or_error(self,structure, chain_id, detailed_seq): + # 这里的detailed_seq是从extract_sequences(detailed=True)获取的 + for i in range(len(detailed_seq)-1): + current_res, next_res = detailed_seq[i][0], detailed_seq[i+1][0] + distance = self.calculate_distance(structure, chain_id, current_res, next_res) + # 根据距离判断和逻辑处理 + + def renumber_based_on_judgement(self, chain_id, detailed_seq): + # 实现基于前面的判断逻辑来重新编号的代码 + ... + + @classmethod def cleanATOM(cls, input_file: Path, out_file: Path = None, ext: str = ".clean.pdb") -> 'PDBAnalyzer': """ @@ -185,26 +211,48 @@ class PDBAnalyzer: def check_and_log_sequence_issues(self): """ - 检测并记录每条链的编号问题。 + 检测并记录每条链的编号问题,并计算相邻残基间的距离。 """ for chain_id, detailed_seq in self.extract_sequences(detailed=True).items(): # 检测起始编号是否为1 if detailed_seq[0][0] != 1: self.logger.warning(f"Chain {chain_id} does not start with residue number 1.") - # 检测编号连续性和错误 - for i in range(len(detailed_seq)-1): - diff = detailed_seq[i+1][0] - detailed_seq[i][0] - if diff > 10: - self.logger.warning(f"Potential numbering error in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.") - elif diff > 1: - self.logger.warning(f"Missing residues in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.") - - def renumber_residues_based_on_issues(self, chains: Union[List[str], str, None] = None, return_pandas: bool = False): - modifications = self.find_issues_and_generate_modifications(chains) - self.batch_modify_residues(modifications) - if return_pandas: - return self.biodata + # 检测编号连续性和错误,同时计算距离 + for i in range(len(detailed_seq) - 1): + diff = detailed_seq[i + 1][0] - detailed_seq[i][0] + # if diff > 1: + # 计算两个相邻残基的Cα原子之间的距离 + distance = self.calculate_distance(chain_id, detailed_seq[i][0], detailed_seq[i + 1][0]) + # 如果编号相差很大,但Cα原子距离不符合预期(既不符合正常距离,也不符合预期的缺失距离) + if diff > 10 or (distance < 3 or distance > 5 * diff): + self.logger.warning(f"Potential numbering error or unexpected distance in chain {chain_id} between residues {detailed_seq[i][0]} ({detailed_seq[i][1]}) and {detailed_seq[i + 1][0]} ({detailed_seq[i + 1][1]}). Distance: {distance:.2f} Å") + elif 3.3 <= distance <= 4.8 * diff: + # 距离在合理范围内,可能是正常的残基缺失 + self.ca_distances.append(distance) + # self.logger.info(f"Missing residues detected in chain {chain_id} between residues {detailed_seq[i][0]} ({detailed_seq[i][1]}) and {detailed_seq[i + 1][0]} ({detailed_seq[i + 1][1]}). Estimated missing residues based on distance: {round(distance / 5)}. Distance: {distance:.2f} Å") + + @classmethod + def renumber_residues_based_on_issues_and_clean(cls, input_file: Path, out_ext: str = ".renumbered.pdb", chains: Union[List[str], str, None] = None) -> 'PDBAnalyzer': + """ + 先清洗PDB文件,再重新编号,并返回处理后的cls实例。 + + Args: + input_file (Path): PDB文件路径。 + out_ext (str): 重新编号后的输出文件扩展名,默认为".renumbered.pdb"。 + chains (Union[List[str], str, None]): 可选;指定要重新编号的链。 + + Returns: + PDBAnalyzer: 经过清洗和重新编号的PDBAnalyzer实例。 + """ + cleaned_analyzer = cls.cleanATOM(input_file) + modifications = cleaned_analyzer.find_issues_and_generate_modifications(chains) + cleaned_analyzer.batch_modify_residues(modifications) + output_file = input_file.with_suffix(out_ext) + cleaned_analyzer.save_biodata(output_file) # 假设这个方法可以将修改后的数据写入文件 + + # 返回一个新的PDBAnalyzer实例,指向重新编号后的文件 + return cls(output_file) def find_issues_and_generate_modifications(self, specified_chains=None) -> dict: """ @@ -549,6 +597,19 @@ class PDBAnalyzer: if not fasta_file.exists(): self.download_fasta() return SeqIO.parse(fasta_file.as_posix(), "fasta") + + @property + def ca_distances_median(self): + """ + 计算整个蛋白质中所有氨基酸Cα原子之间距离的中位数。 + + Returns: + float: 所有Cα原子间距离的中位数。 + """ + if self.ca_distances: + return np.median(self.ca_distances) + else: + return None @dataclass class PDBAlign: @@ -664,34 +725,7 @@ def import_and_merge_pdb_strings(pdb_strings, merged_object_name, output_file): # 保存合并后的对象 cmd.save(output_file, merged_object_name) -''' -# 示例: 使用biopython提取A链(将会保留HETATM) -chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A' -chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件 -# 示例: 使用biopandas提取A链(将不会保留HETATM) -chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A' -chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件 -# A链改B链, 并分割保存为单独文件 -analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb') -# 分割所有的链 -split_dict = {} -for j in analyzer.chain_id_list: - fn = Path(f'{PDB_ID}_{j}.pdb') - analyzer.split_chain(j).to_pdb(fn.as_posix()) - split_dict[j]=fn.read_text() -''' - if __name__ == "__main__": - # import argparse - # parser = argparse.ArgumentParser(description="Build model by Modeller") - # parser.add_argument("-s", "--structure", help="Structure file") - # parser.add_argument("-o", "--outdir", help="Output directory") - # parser.add_argument("-f", "--fasta", help="Fasta file") - # parser.add_argument("-n", "--num_loop", help="Number of loop model") - # parser.add_argument("-m", "--md_level", help="MD level") - # parser.add_argument("-c", "--chain", help="Chain ID") - # args = parser.parse_args() - # fix_all(Path('./pdb_test1')) pass