From ced22ac37b8a17a8fb95dd198e6f2e7db1add055 Mon Sep 17 00:00:00 2001
From: lingyuzeng <pylyzeng@gmail.com>
Date: Thu, 7 Mar 2024 14:26:53 +0800
Subject: [PATCH] add modify_residue_number and change logging

---
 analysis_pdb.py | 135 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 89 insertions(+), 46 deletions(-)

diff --git a/analysis_pdb.py b/analysis_pdb.py
index def6a5e..74e7599 100755
--- a/analysis_pdb.py
+++ b/analysis_pdb.py
@@ -17,7 +17,7 @@ from dataclasses import dataclass, field
 from Bio.PDB import PDBParser
 from Bio.SeqUtils import seq1
 from Bio.Data import IUPACData
-from typing import List, Dict, Tuple, Optional
+from typing import List, Dict, Tuple, Optional, Union
 from functools import reduce, partial
 from Bio.PDB import MMCIFIO, PDBIO, Chain, Structure
 from biopandas.pdb import PandasPdb
@@ -108,18 +108,10 @@ class PDBAnalyzer:
         """ 
         Initialize the PDB structure after the object is created.
         """
-        self._initialize_logging()
+        self._configure_logging()
         self._initialize_structure()
         self._initialize_properties()
         
-    def _initialize_logging(self):
-        self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
-        self.logger.setLevel(logging.DEBUG)
-        handler = RotatingFileHandler(f"{self.pdb_file.stem}.log", maxBytes=1048576, backupCount=5)
-        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
-        handler.setFormatter(formatter)
-        self.logger.addHandler(handler)
-        
     def _initialize_structure(self):
         self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix())
         self.protein_structure = self.structure
@@ -131,6 +123,33 @@ class PDBAnalyzer:
         self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix())
         self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
         self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
+        
+    def _configure_logging(self):
+        """配置日志系统。"""
+        self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
+        self.logger.setLevel(logging.DEBUG)  # 设置日志级别
+        # 创建一个文件处理器，并设置级别和格式
+        file_handler = RotatingFileHandler(self.log_file, maxBytes=1024*1024*5, backupCount=5)
+        file_handler.setLevel(logging.DEBUG)
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        file_handler.setFormatter(formatter)
+        # 添加处理器到logger
+        self.logger.addHandler(file_handler)
+
+    def _initialize_structure(self):
+        """Initialize properties based on the pdb_file."""
+        self.pdb_file_stem = self.pdb_file.stem.split('.')[0]
+        self.pid = self.pdb_file_stem.lower() if len(self.pdb_file_stem) == 4 else None
+        
+        parser = PDBParser(QUIET=True)
+        self.structure = parser.get_structure(self.pdb_file_stem, self.pdb_file)
+        self.protein_structure = self.structure  # Assuming identical for simplified reference
+
+        self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix())
+        self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
+        self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
+
+        self.logger.info("Structure and properties initialized.")
 
     @classmethod
     def cleanATOM(cls, input_file: Path, out_file: Path = None, ext: str = ".clean.pdb") -> 'PDBAnalyzer':
@@ -160,41 +179,74 @@ class PDBAnalyzer:
         # Return a new instance of PDBAnalyzer pointing to the cleaned file
         return cls(out_file)
     
-    def modify_residue_number(self, chain_id: str, original_res_num: int, new_res_num: int) -> None:
+    def check_and_log_sequence_issues(self):
         """
-        Modify the residue number for a specific chain in a PDB file.
+        检测并记录每条链的编号问题。
+        """
+        for chain_id, detailed_seq in self.extract_sequences(detailed=True).items():
+            # 检测起始编号是否为1
+            if detailed_seq[0][0] != 1:
+                self.logger.warning(f"Chain {chain_id} does not start with residue number 1.")
+            
+            # 检测编号连续性和错误
+            for i in range(len(detailed_seq)-1):
+                diff = detailed_seq[i+1][0] - detailed_seq[i][0]
+                if diff > 10:
+                    self.logger.warning(f"Potential numbering error in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.")
+                elif diff > 1:
+                    self.logger.warning(f"Missing residues in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.")
+    
+    def renumber_residues_based_on_issues(self, chains: Union[List[str], str, None] = None, return_pandas: bool = False):
+        modifications = self.find_issues_and_generate_modifications(chains)
+        self.batch_modify_residues(modifications)
+        if return_pandas:
+            return self.biodata
+
+    def find_issues_and_generate_modifications(self, specified_chains=None) -> dict:
+        """
+        根据实际情况生成modifications字典，用于重新编号。
 
         Args:
-        chain_id (str): The chain identifier in the PDB file.
-        original_res_num (int): The original residue number to find.
-        new_res_num (int): The new residue number to replace with.
+        specified_chains (Union[List[str], str, None]): 指定的链。
+
+        Returns:
+        dict: 修改方案字典。
         """
-        # Check if the residue number to be replaced exists
+        modifications = {}
+
+        # 示例逻辑：检查每条链的编号问题并生成修改方案
+        for chain_id in self.chain_id_list:
+            if specified_chains and chain_id not in specified_chains:
+                continue  # 如果指定了链且当前链不在指定列表中，则跳过
+
+            # 生成连续的新编号方案
+            residues = self.biodata.df['ATOM'][self.biodata.df['ATOM']['chain_id'] == chain_id]
+            original_res_nums = residues['residue_number'].unique()
+            sorted_original_res_nums = sorted(original_res_nums)
+            
+            # 检查并生成修改方案
+            new_res_nums = range(1, len(sorted_original_res_nums) + 1)
+            changes = zip(sorted_original_res_nums, new_res_nums)
+            modifications[chain_id] = list(changes)
+
+        return modifications
+    
+    def modify_residue_number(self, chain_id: str, original_res_num: int, new_res_num: int, return_pandas: bool = False):
         mask = (self.biodata.df['ATOM']['chain_id'] == chain_id) & \
                (self.biodata.df['ATOM']['residue_number'] == original_res_num)
         if not mask.any():
-            raise ValueError(f"Residue number {original_res_num} in chain {chain_id} does not exist in the PDB file.")
-
-        # Update the residue number
+            self.logger.warning(f"Residue number {original_res_num} in chain {chain_id} does not exist in the PDB file.")
+            return
         self.biodata.df['ATOM'].loc[mask, 'residue_number'] = new_res_num
+        if return_pandas:
+            return self.biodata
 
-    def batch_modify_residues(self, modifications: dict) -> PandasPdb:
-        """
-        Batch modify multiple residues in a PDB file based on a dictionary of modifications.
-
-        Args:
-        modifications (dict): A dictionary with chain identifiers as keys and a list of
-                              tuples (original_res_num, new_res_num) as values.
-
-        Returns:
-        PandasPdb: The modified PDB structure as a PandasPdb object.
-        """
+    def batch_modify_residues(self, modifications: dict, return_pandas: bool = False):
         for chain_id, changes in modifications.items():
             for original_res_num, new_res_num in changes:
                 self.modify_residue_number(chain_id, original_res_num, new_res_num)
-
-        # Return the modified PandasPdb object
-        return self.biodata
+        if return_pandas:
+            return self.biodata
 
     def extract_chains_to_new_pdb(self, chains: List[str]) -> PandasPdb:
         """
@@ -377,19 +429,10 @@ class PDBAnalyzer:
             loops_by_chain[chain_id] = loops
         return loops_by_chain
     
-    def change_chain_identifier(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PandasPdb:
-        """
-        Change the identifier of a specific chain. default use biopandas , not test yet. 
-
-        Args:
-        old_chain_id (str): Original identifier of the chain.
-        new_chain_id (str): New identifier to be assigned to the chain.
-        return:
-        object: An new object,which not influce original self.biodata, containing the changed chain with a save method.
-        """
-        biodata = self.split_chain(old_chain_id) if split else deepcopy(self.biodata) # get ATOM colum 
-        biodata.df['ATOM']['chain_id'] = biodata.df['ATOM']['chain_id'].replace(old_chain_id.strip().upper(), new_chain_id.strip().upper())
-        return biodata # use to_pdb to save
+    def change_chain_identifier(self, old_chain_id: str, new_chain_id: str, return_pandas: bool = False):
+        self.biodata.df['ATOM']['chain_id'] = self.biodata.df['ATOM']['chain_id'].replace(old_chain_id, new_chain_id)
+        if return_pandas:
+            return self.biodata
     
     def change_chain_identifier_biopython(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PDBIO:
         io = PDBIO()