add modify_residue_number and change logging
This commit is contained in:
135
analysis_pdb.py
135
analysis_pdb.py
@@ -17,7 +17,7 @@ from dataclasses import dataclass, field
|
||||
from Bio.PDB import PDBParser
|
||||
from Bio.SeqUtils import seq1
|
||||
from Bio.Data import IUPACData
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
from typing import List, Dict, Tuple, Optional, Union
|
||||
from functools import reduce, partial
|
||||
from Bio.PDB import MMCIFIO, PDBIO, Chain, Structure
|
||||
from biopandas.pdb import PandasPdb
|
||||
@@ -108,18 +108,10 @@ class PDBAnalyzer:
|
||||
"""
|
||||
Initialize the PDB structure after the object is created.
|
||||
"""
|
||||
self._initialize_logging()
|
||||
self._configure_logging()
|
||||
self._initialize_structure()
|
||||
self._initialize_properties()
|
||||
|
||||
def _initialize_logging(self):
|
||||
self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
|
||||
self.logger.setLevel(logging.DEBUG)
|
||||
handler = RotatingFileHandler(f"{self.pdb_file.stem}.log", maxBytes=1048576, backupCount=5)
|
||||
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
|
||||
handler.setFormatter(formatter)
|
||||
self.logger.addHandler(handler)
|
||||
|
||||
def _initialize_structure(self):
|
||||
self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix())
|
||||
self.protein_structure = self.structure
|
||||
@@ -132,6 +124,33 @@ class PDBAnalyzer:
|
||||
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
|
||||
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
|
||||
|
||||
def _configure_logging(self):
|
||||
"""配置日志系统。"""
|
||||
self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
|
||||
self.logger.setLevel(logging.DEBUG) # 设置日志级别
|
||||
# 创建一个文件处理器,并设置级别和格式
|
||||
file_handler = RotatingFileHandler(self.log_file, maxBytes=1024*1024*5, backupCount=5)
|
||||
file_handler.setLevel(logging.DEBUG)
|
||||
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
file_handler.setFormatter(formatter)
|
||||
# 添加处理器到logger
|
||||
self.logger.addHandler(file_handler)
|
||||
|
||||
def _initialize_structure(self):
|
||||
"""Initialize properties based on the pdb_file."""
|
||||
self.pdb_file_stem = self.pdb_file.stem.split('.')[0]
|
||||
self.pid = self.pdb_file_stem.lower() if len(self.pdb_file_stem) == 4 else None
|
||||
|
||||
parser = PDBParser(QUIET=True)
|
||||
self.structure = parser.get_structure(self.pdb_file_stem, self.pdb_file)
|
||||
self.protein_structure = self.structure # Assuming identical for simplified reference
|
||||
|
||||
self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix())
|
||||
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
|
||||
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
|
||||
|
||||
self.logger.info("Structure and properties initialized.")
|
||||
|
||||
@classmethod
|
||||
def cleanATOM(cls, input_file: Path, out_file: Path = None, ext: str = ".clean.pdb") -> 'PDBAnalyzer':
|
||||
"""
|
||||
@@ -160,41 +179,74 @@ class PDBAnalyzer:
|
||||
# Return a new instance of PDBAnalyzer pointing to the cleaned file
|
||||
return cls(out_file)
|
||||
|
||||
def modify_residue_number(self, chain_id: str, original_res_num: int, new_res_num: int) -> None:
|
||||
def check_and_log_sequence_issues(self):
|
||||
"""
|
||||
Modify the residue number for a specific chain in a PDB file.
|
||||
检测并记录每条链的编号问题。
|
||||
"""
|
||||
for chain_id, detailed_seq in self.extract_sequences(detailed=True).items():
|
||||
# 检测起始编号是否为1
|
||||
if detailed_seq[0][0] != 1:
|
||||
self.logger.warning(f"Chain {chain_id} does not start with residue number 1.")
|
||||
|
||||
# 检测编号连续性和错误
|
||||
for i in range(len(detailed_seq)-1):
|
||||
diff = detailed_seq[i+1][0] - detailed_seq[i][0]
|
||||
if diff > 10:
|
||||
self.logger.warning(f"Potential numbering error in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.")
|
||||
elif diff > 1:
|
||||
self.logger.warning(f"Missing residues in chain {chain_id} between residues {detailed_seq[i][0]} and {detailed_seq[i+1][0]}.")
|
||||
|
||||
def renumber_residues_based_on_issues(self, chains: Union[List[str], str, None] = None, return_pandas: bool = False):
|
||||
modifications = self.find_issues_and_generate_modifications(chains)
|
||||
self.batch_modify_residues(modifications)
|
||||
if return_pandas:
|
||||
return self.biodata
|
||||
|
||||
def find_issues_and_generate_modifications(self, specified_chains=None) -> dict:
|
||||
"""
|
||||
根据实际情况生成modifications字典,用于重新编号。
|
||||
|
||||
Args:
|
||||
chain_id (str): The chain identifier in the PDB file.
|
||||
original_res_num (int): The original residue number to find.
|
||||
new_res_num (int): The new residue number to replace with.
|
||||
specified_chains (Union[List[str], str, None]): 指定的链。
|
||||
|
||||
Returns:
|
||||
dict: 修改方案字典。
|
||||
"""
|
||||
# Check if the residue number to be replaced exists
|
||||
modifications = {}
|
||||
|
||||
# 示例逻辑:检查每条链的编号问题并生成修改方案
|
||||
for chain_id in self.chain_id_list:
|
||||
if specified_chains and chain_id not in specified_chains:
|
||||
continue # 如果指定了链且当前链不在指定列表中,则跳过
|
||||
|
||||
# 生成连续的新编号方案
|
||||
residues = self.biodata.df['ATOM'][self.biodata.df['ATOM']['chain_id'] == chain_id]
|
||||
original_res_nums = residues['residue_number'].unique()
|
||||
sorted_original_res_nums = sorted(original_res_nums)
|
||||
|
||||
# 检查并生成修改方案
|
||||
new_res_nums = range(1, len(sorted_original_res_nums) + 1)
|
||||
changes = zip(sorted_original_res_nums, new_res_nums)
|
||||
modifications[chain_id] = list(changes)
|
||||
|
||||
return modifications
|
||||
|
||||
def modify_residue_number(self, chain_id: str, original_res_num: int, new_res_num: int, return_pandas: bool = False):
|
||||
mask = (self.biodata.df['ATOM']['chain_id'] == chain_id) & \
|
||||
(self.biodata.df['ATOM']['residue_number'] == original_res_num)
|
||||
if not mask.any():
|
||||
raise ValueError(f"Residue number {original_res_num} in chain {chain_id} does not exist in the PDB file.")
|
||||
|
||||
# Update the residue number
|
||||
self.logger.warning(f"Residue number {original_res_num} in chain {chain_id} does not exist in the PDB file.")
|
||||
return
|
||||
self.biodata.df['ATOM'].loc[mask, 'residue_number'] = new_res_num
|
||||
if return_pandas:
|
||||
return self.biodata
|
||||
|
||||
def batch_modify_residues(self, modifications: dict) -> PandasPdb:
|
||||
"""
|
||||
Batch modify multiple residues in a PDB file based on a dictionary of modifications.
|
||||
|
||||
Args:
|
||||
modifications (dict): A dictionary with chain identifiers as keys and a list of
|
||||
tuples (original_res_num, new_res_num) as values.
|
||||
|
||||
Returns:
|
||||
PandasPdb: The modified PDB structure as a PandasPdb object.
|
||||
"""
|
||||
def batch_modify_residues(self, modifications: dict, return_pandas: bool = False):
|
||||
for chain_id, changes in modifications.items():
|
||||
for original_res_num, new_res_num in changes:
|
||||
self.modify_residue_number(chain_id, original_res_num, new_res_num)
|
||||
|
||||
# Return the modified PandasPdb object
|
||||
return self.biodata
|
||||
if return_pandas:
|
||||
return self.biodata
|
||||
|
||||
def extract_chains_to_new_pdb(self, chains: List[str]) -> PandasPdb:
|
||||
"""
|
||||
@@ -377,19 +429,10 @@ class PDBAnalyzer:
|
||||
loops_by_chain[chain_id] = loops
|
||||
return loops_by_chain
|
||||
|
||||
def change_chain_identifier(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PandasPdb:
|
||||
"""
|
||||
Change the identifier of a specific chain. default use biopandas , not test yet.
|
||||
|
||||
Args:
|
||||
old_chain_id (str): Original identifier of the chain.
|
||||
new_chain_id (str): New identifier to be assigned to the chain.
|
||||
return:
|
||||
object: An new object,which not influce original self.biodata, containing the changed chain with a save method.
|
||||
"""
|
||||
biodata = self.split_chain(old_chain_id) if split else deepcopy(self.biodata) # get ATOM colum
|
||||
biodata.df['ATOM']['chain_id'] = biodata.df['ATOM']['chain_id'].replace(old_chain_id.strip().upper(), new_chain_id.strip().upper())
|
||||
return biodata # use to_pdb to save
|
||||
def change_chain_identifier(self, old_chain_id: str, new_chain_id: str, return_pandas: bool = False):
|
||||
self.biodata.df['ATOM']['chain_id'] = self.biodata.df['ATOM']['chain_id'].replace(old_chain_id, new_chain_id)
|
||||
if return_pandas:
|
||||
return self.biodata
|
||||
|
||||
def change_chain_identifier_biopython(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PDBIO:
|
||||
io = PDBIO()
|
||||
|
||||
Reference in New Issue
Block a user