add log_numbering_error

This commit is contained in:
2024-03-07 17:50:43 +08:00
parent 9d9813cbfe
commit 7b1a0cddb3

View File

@@ -106,6 +106,7 @@ class PDBAnalyzer:
log_file: Path = field(init=False)
logger: logging.Logger = field(init=False)
ca_distances: List[float] = field(init=False)
renumber_errors: List[Dict[str, Tuple[int, int, int]]] = field(default_factory=list)
def __post_init__(self):
"""
@@ -197,6 +198,55 @@ class PDBAnalyzer:
# Return a new instance of PDBAnalyzer pointing to the cleaned file
return cls(out_file)
def collect_renumbering_info(self):
    """
    Collect the residue renumbering map for every chain.

    For each chain ID in ``self.chain_id_list``, the distinct
    ``residue_number`` values of that chain's ATOM records are sorted in
    ascending order and paired with consecutive new numbers starting at 1.

    The result is stored on ``self.renumbering_info`` as a dict mapping
    chain ID to a list of ``(original_res_num, new_res_num)`` tuples. A chain
    without any ATOM records maps to an empty list.
    """
    # The ATOM table is the same for every chain, so look it up only once.
    atoms = self.biodata.df['ATOM']
    renumbering_info = {}
    for chain_id in self.chain_id_list:
        chain_numbers = atoms.loc[atoms['chain_id'] == chain_id, 'residue_number']
        # Series.tolist() yields builtin Python scalars instead of numpy ones,
        # so the stored pairs print and serialize cleanly.
        # NOTE(review): residues are distinguished by residue_number only;
        # confirm that insertion codes do not need to be taken into account.
        originals = sorted(chain_numbers.drop_duplicates().tolist())
        renumbering_info[chain_id] = [
            (orig_res_num, new_res_num)
            for new_res_num, orig_res_num in enumerate(originals, start=1)
        ]
    self.renumbering_info = renumbering_info
def log_numbering_error(self, chain_id: str, start_residue: int, end_residue: int, estimated_missing: int):
    """
    Record one residue-numbering error.

    A dict describing the error is appended to ``self.renumber_errors``.

    Args:
        chain_id (str): ID of the chain in which the error occurs.
        start_residue (int): Residue number at which the error starts.
        end_residue (int): Residue number at which the error ends.
        estimated_missing (int): Estimated number of missing residues.
    """
    record_keys = ("chain_id", "start_residue", "end_residue", "estimated_missing")
    record_values = (chain_id, start_residue, end_residue, estimated_missing)
    self.renumber_errors.append(dict(zip(record_keys, record_values)))
# def renumber_residues(self):
# """
# 根据收集的残基信息重新编号所有残基。
# """
# for chain_id, res_pairs in self.renumbering_info.items():
# for orig_res_num, new_res_num in res_pairs:
# self.modify_residue_number(chain_id, orig_res_num, new_res_num)
# def modify_residue_number(self, chain_id, original_res_num, new_res_num):
# """
# 修改指定链上的残基编号。
# """
# mask = (self.biodata.df['ATOM']['chain_id'] == chain_id) & \
# (self.biodata.df['ATOM']['residue_number'] == original_res_num)
# self.biodata.df['ATOM'].loc[mask, 'residue_number'] = new_res_num
def check_and_log_sequence_issues(self):
"""
检测并记录每条链的编号问题,并计算相邻残基间的距离。
@@ -224,6 +274,7 @@ class PDBAnalyzer:
else:
missing_number = int(np.round(distance / 3.8 ) - 1)
self.logger.warning(f"Wrong sequence numbering in chain {chain_id} between residues {prev_res[0]} and {next_res[0]}, distance: {distance:.2f} Å, missing residue number: {missing_number}")
self.log_numbering_error(chain_id, prev_res[0], next_res[0], missing_number)
@classmethod
def renumber_residues_based_on_issues_and_clean(cls, input_file: Path, out_ext: str = ".renumbered.pdb", chains: Union[List[str], str, None] = None) -> 'PDBAnalyzer':