# Methods added to class PDBAnalyzer in analysis_pdb.py (shown dedented).
# The same change adds the dataclass field that log_numbering_error appends
# to; its annotation should describe the records that are actually stored:
#     renumber_errors: List[Dict[str, Union[str, int]]] = field(default_factory=list)


def collect_renumbering_info(self) -> None:
    """Build the per-chain map from original residue numbers to new ones.

    For every chain in ``self.chain_id_list`` the distinct ``residue_number``
    values of its ATOM records are sorted in ascending order and paired with
    consecutive numbers starting at 1.  The result is stored on
    ``self.renumbering_info`` as ``{chain_id: [(original, new), ...]}``; a
    chain without ATOM records maps to an empty list.
    """
    atoms = self.biodata.df['ATOM']  # look the table up once, not per chain
    renumbering_info = {}  # chain ID -> list of (original_res_num, new_res_num)
    for chain_id in self.chain_id_list:
        chain_numbers = atoms.loc[atoms['chain_id'] == chain_id, 'residue_number']
        # tolist() converts NumPy scalars to plain Python values so both
        # members of every pair have the same builtin type.
        originals = sorted(chain_numbers.unique().tolist())
        renumbering_info[chain_id] = [
            (orig_res_num, new_res_num)
            for new_res_num, orig_res_num in enumerate(originals, start=1)
        ]
    self.renumbering_info = renumbering_info


def log_numbering_error(self, chain_id: str, start_residue: int, end_residue: int, estimated_missing: int) -> None:
    """Record one residue-numbering error in ``self.renumber_errors``.

    Args:
        chain_id: ID of the chain the error was found in.
        start_residue: Residue number at which the error starts.
        end_residue: Residue number at which the error ends.
        estimated_missing: Estimated number of missing residues.
    """
    # Callers may pass NumPy scalars; store builtin types so each record is a
    # plain, serialisable dict.
    self.renumber_errors.append({
        "chain_id": str(chain_id),
        "start_residue": int(start_residue),
        "end_residue": int(end_residue),
        "estimated_missing": int(estimated_missing),
    })


# Disabled draft kept from the original change (not active code):
#
# def renumber_residues(self):
#     """
#     Renumber every residue according to the collected residue info.
#     """
#     for chain_id, res_pairs in self.renumbering_info.items():
#         for orig_res_num, new_res_num in res_pairs:
#             self.modify_residue_number(chain_id, orig_res_num, new_res_num)

# def modify_residue_number(self, chain_id, original_res_num, new_res_num):
#     """
#     Change the residue number on the given chain.
#     """
#     mask = (self.biodata.df['ATOM']['chain_id'] == chain_id) & \
(self.biodata.df['ATOM']['residue_number'] == original_res_num) + # self.biodata.df['ATOM'].loc[mask, 'residue_number'] = new_res_num + def check_and_log_sequence_issues(self): """ 检测并记录每条链的编号问题,并计算相邻残基间的距离。 @@ -224,6 +274,7 @@ class PDBAnalyzer: else: missing_number = int(np.round(distance / 3.8 ) - 1) self.logger.warning(f"Wrong sequence numbering in chain {chain_id} between residues {prev_res[0]} and {next_res[0]}, distance: {distance:.2f} Å, missing residue number: {missing_number}") + self.log_numbering_error(chain_id, prev_res[0], next_res[0], missing_number) @classmethod def renumber_residues_based_on_issues_and_clean(cls, input_file: Path, out_ext: str = ".renumbered.pdb", chains: Union[List[str], str, None] = None) -> 'PDBAnalyzer':