add modify_residue_number and change logging

This commit is contained in:
2024-03-07 14:26:53 +08:00
parent 74b31db5a5
commit ced22ac37b

View File

@@ -17,7 +17,7 @@ from dataclasses import dataclass, field
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1
from Bio.Data import IUPACData
from typing import List, Dict, Tuple, Optional
from typing import List, Dict, Tuple, Optional, Union
from functools import reduce, partial
from Bio.PDB import MMCIFIO, PDBIO, Chain, Structure
from biopandas.pdb import PandasPdb
@@ -108,18 +108,10 @@ class PDBAnalyzer:
"""
Initialize the PDB structure after the object is created.
"""
self._initialize_logging()
self._configure_logging()
self._initialize_structure()
self._initialize_properties()
def _initialize_logging(self):
self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
self.logger.setLevel(logging.DEBUG)
handler = RotatingFileHandler(f"{self.pdb_file.stem}.log", maxBytes=1048576, backupCount=5)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
self.logger.addHandler(handler)
def _initialize_structure(self):
self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix())
self.protein_structure = self.structure
@@ -132,6 +124,33 @@ class PDBAnalyzer:
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
def _configure_logging(self):
"""配置日志系统。"""
self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
self.logger.setLevel(logging.DEBUG) # 设置日志级别
# 创建一个文件处理器,并设置级别和格式
file_handler = RotatingFileHandler(self.log_file, maxBytes=1024*1024*5, backupCount=5)
file_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
# 添加处理器到logger
self.logger.addHandler(file_handler)
def _initialize_structure(self):
"""Initialize properties based on the pdb_file."""
self.pdb_file_stem = self.pdb_file.stem.split('.')[0]
self.pid = self.pdb_file_stem.lower() if len(self.pdb_file_stem) == 4 else None
parser = PDBParser(QUIET=True)
self.structure = parser.get_structure(self.pdb_file_stem, self.pdb_file)
self.protein_structure = self.structure # Assuming identical for simplified reference
self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix())
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
self.logger.info("Structure and properties initialized.")
@classmethod
def cleanATOM(cls, input_file: Path, out_file: Path = None, ext: str = ".clean.pdb") -> 'PDBAnalyzer':
"""
@@ -160,41 +179,74 @@ class PDBAnalyzer:
# Return a new instance of PDBAnalyzer pointing to the cleaned file
return cls(out_file)
def modify_residue_number(self, chain_id: str, original_res_num: int, new_res_num: int) -> None:
def check_and_log_sequence_issues(self):
"""
Modify the residue number for a specific chain in a PDB file.
检测并记录每条链的编号问题。
"""
for chain_id, detailed_seq in self.extract_sequences(detailed=True).items():
# 检测起始编号是否为1
if detailed_seq[0][0] != 1:
self.logger.warning(f"Chain {chain_id} does not start with residue number 1.")
# 检测编号连续性和错误
for i in range(len(detailed_seq)-1):
diff = detailed_seq[i+1][0] - detailed_seq[i][0]
if diff > 10:
self.logger.warning(f"Potential numbering error in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.")
elif diff > 1:
self.logger.warning(f"Missing residues in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.")
def renumber_residues_based_on_issues(self, chains: Union[List[str], str, None] = None, return_pandas: bool = False):
modifications = self.find_issues_and_generate_modifications(chains)
self.batch_modify_residues(modifications)
if return_pandas:
return self.biodata
def find_issues_and_generate_modifications(self, specified_chains=None) -> dict:
"""
根据实际情况生成modifications字典用于重新编号。
Args:
chain_id (str): The chain identifier in the PDB file.
original_res_num (int): The original residue number to find.
new_res_num (int): The new residue number to replace with.
specified_chains (Union[List[str], str, None]): 指定的链。
Returns:
dict: 修改方案字典。
"""
# Check if the residue number to be replaced exists
modifications = {}
# 示例逻辑:检查每条链的编号问题并生成修改方案
for chain_id in self.chain_id_list:
if specified_chains and chain_id not in specified_chains:
continue # 如果指定了链且当前链不在指定列表中,则跳过
# 生成连续的新编号方案
residues = self.biodata.df['ATOM'][self.biodata.df['ATOM']['chain_id'] == chain_id]
original_res_nums = residues['residue_number'].unique()
sorted_original_res_nums = sorted(original_res_nums)
# 检查并生成修改方案
new_res_nums = range(1, len(sorted_original_res_nums) + 1)
changes = zip(sorted_original_res_nums, new_res_nums)
modifications[chain_id] = list(changes)
return modifications
def modify_residue_number(self, chain_id: str, original_res_num: int, new_res_num: int, return_pandas: bool = False):
mask = (self.biodata.df['ATOM']['chain_id'] == chain_id) & \
(self.biodata.df['ATOM']['residue_number'] == original_res_num)
if not mask.any():
raise ValueError(f"Residue number {original_res_num} in chain {chain_id} does not exist in the PDB file.")
# Update the residue number
self.logger.warning(f"Residue number {original_res_num} in chain {chain_id} does not exist in the PDB file.")
return
self.biodata.df['ATOM'].loc[mask, 'residue_number'] = new_res_num
if return_pandas:
return self.biodata
def batch_modify_residues(self, modifications: dict) -> PandasPdb:
"""
Batch modify multiple residues in a PDB file based on a dictionary of modifications.
Args:
modifications (dict): A dictionary with chain identifiers as keys and a list of
tuples (original_res_num, new_res_num) as values.
Returns:
PandasPdb: The modified PDB structure as a PandasPdb object.
"""
def batch_modify_residues(self, modifications: dict, return_pandas: bool = False):
for chain_id, changes in modifications.items():
for original_res_num, new_res_num in changes:
self.modify_residue_number(chain_id, original_res_num, new_res_num)
# Return the modified PandasPdb object
return self.biodata
if return_pandas:
return self.biodata
def extract_chains_to_new_pdb(self, chains: List[str]) -> PandasPdb:
"""
@@ -377,19 +429,10 @@ class PDBAnalyzer:
loops_by_chain[chain_id] = loops
return loops_by_chain
def change_chain_identifier(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PandasPdb:
"""
Change the identifier of a specific chain. default use biopandas , not test yet.
Args:
old_chain_id (str): Original identifier of the chain.
new_chain_id (str): New identifier to be assigned to the chain.
return:
object: An new object,which not influce original self.biodata, containing the changed chain with a save method.
"""
biodata = self.split_chain(old_chain_id) if split else deepcopy(self.biodata) # get ATOM colum
biodata.df['ATOM']['chain_id'] = biodata.df['ATOM']['chain_id'].replace(old_chain_id.strip().upper(), new_chain_id.strip().upper())
return biodata # use to_pdb to save
def change_chain_identifier(self, old_chain_id: str, new_chain_id: str, return_pandas: bool = False):
self.biodata.df['ATOM']['chain_id'] = self.biodata.df['ATOM']['chain_id'].replace(old_chain_id, new_chain_id)
if return_pandas:
return self.biodata
def change_chain_identifier_biopython(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PDBIO:
io = PDBIO()