From ced22ac37b8a17a8fb95dd198e6f2e7db1add055 Mon Sep 17 00:00:00 2001 From: lingyuzeng Date: Thu, 7 Mar 2024 14:26:53 +0800 Subject: [PATCH] add modify_residue_number and change logging --- analysis_pdb.py | 135 +++++++++++++++++++++++++++++++----------------- 1 file changed, 89 insertions(+), 46 deletions(-) diff --git a/analysis_pdb.py b/analysis_pdb.py index def6a5e..74e7599 100755 --- a/analysis_pdb.py +++ b/analysis_pdb.py @@ -17,7 +17,7 @@ from dataclasses import dataclass, field from Bio.PDB import PDBParser from Bio.SeqUtils import seq1 from Bio.Data import IUPACData -from typing import List, Dict, Tuple, Optional +from typing import List, Dict, Tuple, Optional, Union from functools import reduce, partial from Bio.PDB import MMCIFIO, PDBIO, Chain, Structure from biopandas.pdb import PandasPdb @@ -108,18 +108,10 @@ class PDBAnalyzer: """ Initialize the PDB structure after the object is created. """ - self._initialize_logging() + self._configure_logging() self._initialize_structure() self._initialize_properties() - def _initialize_logging(self): - self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}") - self.logger.setLevel(logging.DEBUG) - handler = RotatingFileHandler(f"{self.pdb_file.stem}.log", maxBytes=1048576, backupCount=5) - formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') - handler.setFormatter(formatter) - self.logger.addHandler(handler) - def _initialize_structure(self): self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix()) self.protein_structure = self.structure @@ -131,6 +123,33 @@ class PDBAnalyzer: self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix()) self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo' self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list() + + def _configure_logging(self): + """配置日志系统。""" + self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}") + self.logger.setLevel(logging.DEBUG) # 设置日志级别 + # 创建一个文件处理器,并设置级别和格式 + file_handler = RotatingFileHandler(self.log_file, maxBytes=1024*1024*5, backupCount=5) + file_handler.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + file_handler.setFormatter(formatter) + # 添加处理器到logger + self.logger.addHandler(file_handler) + + def _initialize_structure(self): + """Initialize properties based on the pdb_file.""" + self.pdb_file_stem = self.pdb_file.stem.split('.')[0] + self.pid = self.pdb_file_stem.lower() if len(self.pdb_file_stem) == 4 else None + + parser = PDBParser(QUIET=True) + self.structure = parser.get_structure(self.pdb_file_stem, self.pdb_file) + self.protein_structure = self.structure # Assuming identical for simplified reference + + self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix()) + self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo' + self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list() + + self.logger.info("Structure and properties initialized.") @classmethod def cleanATOM(cls, input_file: Path, out_file: Path = None, ext: str = ".clean.pdb") -> 'PDBAnalyzer': @@ -160,41 +179,74 @@ class PDBAnalyzer: # Return a new instance of PDBAnalyzer pointing to the cleaned file return cls(out_file) - def modify_residue_number(self, chain_id: str, original_res_num: int, new_res_num: int) -> None: + def check_and_log_sequence_issues(self): """ - Modify the residue number for a specific chain in a PDB file. + 检测并记录每条链的编号问题。 + """ + for chain_id, detailed_seq in self.extract_sequences(detailed=True).items(): + # 检测起始编号是否为1 + if detailed_seq[0][0] != 1: + self.logger.warning(f"Chain {chain_id} does not start with residue number 1.") + + # 检测编号连续性和错误 + for i in range(len(detailed_seq)-1): + diff = detailed_seq[i+1][0] - detailed_seq[i][0] + if diff > 10: + self.logger.warning(f"Potential numbering error in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.") + elif diff > 1: + self.logger.warning(f"Missing residues in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.") + + def renumber_residues_based_on_issues(self, chains: Union[List[str], str, None] = None, return_pandas: bool = False): + modifications = self.find_issues_and_generate_modifications(chains) + self.batch_modify_residues(modifications) + if return_pandas: + return self.biodata + + def find_issues_and_generate_modifications(self, specified_chains=None) -> dict: + """ + 根据实际情况生成modifications字典,用于重新编号。 Args: - chain_id (str): The chain identifier in the PDB file. - original_res_num (int): The original residue number to find. - new_res_num (int): The new residue number to replace with. + specified_chains (Union[List[str], str, None]): 指定的链。 + + Returns: + dict: 修改方案字典。 """ - # Check if the residue number to be replaced exists + modifications = {} + + # 示例逻辑:检查每条链的编号问题并生成修改方案 + for chain_id in self.chain_id_list: + if specified_chains and chain_id not in specified_chains: + continue # 如果指定了链且当前链不在指定列表中,则跳过 + + # 生成连续的新编号方案 + residues = self.biodata.df['ATOM'][self.biodata.df['ATOM']['chain_id'] == chain_id] + original_res_nums = residues['residue_number'].unique() + sorted_original_res_nums = sorted(original_res_nums) + + # 检查并生成修改方案 + new_res_nums = range(1, len(sorted_original_res_nums) + 1) + changes = zip(sorted_original_res_nums, new_res_nums) + modifications[chain_id] = list(changes) + + return modifications + + def modify_residue_number(self, chain_id: str, original_res_num: int, new_res_num: int, return_pandas: bool = False): mask = (self.biodata.df['ATOM']['chain_id'] == chain_id) & \ (self.biodata.df['ATOM']['residue_number'] == original_res_num) if not mask.any(): - raise ValueError(f"Residue number {original_res_num} in chain {chain_id} does not exist in the PDB file.") - - # Update the residue number + self.logger.warning(f"Residue number {original_res_num} in chain {chain_id} does not exist in the PDB file.") + return self.biodata.df['ATOM'].loc[mask, 'residue_number'] = new_res_num + if return_pandas: + return self.biodata - def batch_modify_residues(self, modifications: dict) -> PandasPdb: - """ - Batch modify multiple residues in a PDB file based on a dictionary of modifications. - - Args: - modifications (dict): A dictionary with chain identifiers as keys and a list of - tuples (original_res_num, new_res_num) as values. - - Returns: - PandasPdb: The modified PDB structure as a PandasPdb object. - """ + def batch_modify_residues(self, modifications: dict, return_pandas: bool = False): for chain_id, changes in modifications.items(): for original_res_num, new_res_num in changes: self.modify_residue_number(chain_id, original_res_num, new_res_num) - - # Return the modified PandasPdb object - return self.biodata + if return_pandas: + return self.biodata def extract_chains_to_new_pdb(self, chains: List[str]) -> PandasPdb: """ @@ -377,19 +429,10 @@ class PDBAnalyzer: loops_by_chain[chain_id] = loops return loops_by_chain - def change_chain_identifier(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PandasPdb: - """ - Change the identifier of a specific chain. default use biopandas , not test yet. - - Args: - old_chain_id (str): Original identifier of the chain. - new_chain_id (str): New identifier to be assigned to the chain. - return: - object: An new object,which not influce original self.biodata, containing the changed chain with a save method. - """ - biodata = self.split_chain(old_chain_id) if split else deepcopy(self.biodata) # get ATOM colum - biodata.df['ATOM']['chain_id'] = biodata.df['ATOM']['chain_id'].replace(old_chain_id.strip().upper(), new_chain_id.strip().upper()) - return biodata # use to_pdb to save + def change_chain_identifier(self, old_chain_id: str, new_chain_id: str, return_pandas: bool = False): + self.biodata.df['ATOM']['chain_id'] = self.biodata.df['ATOM']['chain_id'].replace(old_chain_id, new_chain_id) + if return_pandas: + return self.biodata def change_chain_identifier_biopython(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PDBIO: io = PDBIO()