Files
analysis_pdb/modelbuilder.py

220 lines
9.8 KiB
Python

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :modelbuilder.py
@Description: :
@Date :2024/01/11 15:29:28
@Author :lyzeng
@Email :pylyzeng@gmail.com
@version :1.0
'''
import argparse
from dataclasses import dataclass, field
from pathlib import Path
import logging
from analysis_pdb import PDBAnalyzer
from build_modeller import PDBModeler
from modeller import ModellerError
import pymol
from typing import Dict
from Bio.SeqRecord import SeqRecord
@dataclass
class PDBAlign:
template_file: Path
target_file: Path
pymol_instance: object = field(init=False)
out_file: Path = field(default=None) # 输出文件路径
def __post_init__(self):
self.initialize_pymol()
def initialize_pymol(self):
self.pymol_instance = pymol.cmd
self.pymol_instance.reinitialize()
def align(self) -> str:
self.pymol_instance.reinitialize()
# 首先,加载模板结构
self.pymol_instance.load(self.template_file.as_posix(), "template")
# 加载并对齐所有目标结构
self.pymol_instance.load(self.target_file.as_posix(), "target")
self.pymol_instance.align("target", "template")
return self.pymol_instance.get_pdbstr('target')
def save(self, out_file: Path):
self.pymol_instance.save(out_file.as_posix(), "target")
@dataclass
class LoopModelBuilder:
pdb_file: Path
output_dir: Path = field(default=None)
logger: logging.Logger = field(init=False)
pymol_instance: object = field(init=False)
analyzer_instance: PDBAnalyzer = field(init=False)
pdb_id: str = field(init=False)
use_pymol_align_merge: bool = False # 控制是否使用 PyMOL 进行对齐和合并的标志
def __post_init__(self):
self.pdb_id = self.pdb_file.stem
self.output_dir = self.pdb_file.parent if self.output_dir is None else self.output_dir
self.analyzer_instance = PDBAnalyzer(self.pdb_file)
self.setup_logging()
self.initialize_pymol()
def setup_logging(self):
log_file = self.output_dir / "loop_modeling.log"
if log_file.exists():
log_file.unlink()
# 创建 logger 实例
self.logger = logging.getLogger('LoopModelBuilder')
self.logger.setLevel(logging.INFO)
# 创建 file handler
file_handler = logging.FileHandler(log_file, mode='a')
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
# 创建 console handler
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
# 将 handlers 添加到 logger
self.logger.addHandler(file_handler)
self.logger.addHandler(console_handler)
def initialize_pymol(self):
self.pymol_instance = pymol.cmd
self.pymol_instance.reinitialize()
def split_chains(self) -> dict:
split_dict = {}
for chain_id in self.analyzer_instance.chain_id_list:
self.logger.info(f"Chain {chain_id} info(from file not fasta): \n{self.sequences[chain_id]}")
chain_file = self.output_dir.joinpath(f"{self.analyzer_instance.pid}_{chain_id}.pdb")
self.analyzer_instance.split_chain(chain_id).to_pdb(chain_file.as_posix())
split_dict[chain_id] = chain_file.read_text()
return split_dict
def merge_and_save_pdb(self, pdb_strings: dict, pdb_id: str):
merged_file = Path(f"{pdb_id}_merged.pdb")
self.import_and_merge_pdb_strings(pdb_strings, "merged_object", merged_file)
@property
def sequences(self) -> Dict:
return self.analyzer_instance.extract_sequences(missing_char='-') # 或者 'X', 或者 ''
@property
def missing_info(self) -> Dict:
return self.analyzer_instance.extract_sequences_info()
def split_all_chains(self):
self.logger.info(f'Residues info for {self.pdb_file}: \n',self.sequences)
self.logger.info(f'Missing residues info for {self.pdb_file}:\n {self.missing_info}')
split_dict = {} # split all chains
for j in self.analyzer_instance.chain_id_list:
self.logger.info(f"Chain {j} info(from file not fasta): \n{self.sequences[j]}")
fn = self.output_dir.joinpath(f"{self.analyzer_instance.pid}_{j}.pdb")
self.analyzer_instance.split_chain(j).to_pdb(fn.as_posix())
split_dict[j]=fn.read_text()
return split_dict
def model_missing_loops(self, typestr:str = 'refine.very_fast', buildnumber: str = 1) -> dict:
mc_dict = {}
if not self.missing_info:
self.logger.info(f'No missing residues found in {self.pdb_file}. Skipping model_missing_loops.')
return mc_dict
self.logger.info(f'Missing residues info for {self.pdb_file}:\n {self.missing_info}')
# create workdir
for mc in self.missing_info.keys():
self.logger.info(f'Building model for chain {mc}')
workdir = self.pdb_file.parent.joinpath(f'./{self.pdb_id}modellerfix_{mc}')
workdir.mkdir(exist_ok=True, parents=True)
mc_fasta = self.analyzer_instance.find_most_similar(self.sequences[mc]) # get misschain fasta file
# write fasta file
mc_fasta_record = SeqRecord(mc_fasta, id=f'{mc}', description=f'{self.pdb_id}_{mc}')
out_fasta_file = workdir.joinpath(f'{self.analyzer_instance.pid}_{mc}.fasta')
self.analyzer_instance.write_seq_to_fasta_single_line(mc_fasta_record, out_fasta_file)
self.logger.info(f'>{self.pdb_id}|{mc}|missing site:{self.missing_info[mc]}|length:{len(mc_fasta)}')
self.logger.info(mc_fasta)
# build model
modeller = PDBModeler(self.pdb_file, out_fasta_file, workdir, mc, buildnumber, typestr)
try:
modeller_results = modeller.make_model()
except ModellerError as mod_err:
self.logger.info(f'Failed to build model for chain {mc}')
self.logger.info(f'No loops detected in {out_fasta_file.name}')
self.logger.info(f'may pdb file sequence is not correct')
self.logger.error(f'Modeller error for chain {mc}: {mod_err}')
continue
except Exception as e:
self.logger.error(f'Unexpected error in model_missing_loops: {e}')
self.logger.info(f'Model files: {[file.name for file in modeller_results]}')
# change id to original
for i in modeller_results:
self.change_chain_identifier(i, 'A', mc, split=False)
# Use PyMOL to align and merge, controlled by use_pymol_align_merge flag
if self.use_pymol_align_merge:
self.logger.warning("PyMOL alignment and merging is disabled or multiple model files exist. "
"This may result in clashes between multiple protein chains, leading to failures in MD simulations. "
"Please ensure the structures are compatible for simulations.")
if len(modeller_results) == 1:
aligner = PDBAlign(self.pdb_file, modeller_results[0])
pdbstr = aligner.align()
mc_dict.update({mc: pdbstr})
else:
self.logger.warning('more than one model file, please set num_loop to 1')
return mc_dict
@staticmethod
def change_chain_identifier(pdb_file: Path, chain_id:str, new_chain_id:str, split:bool = True) -> Path:
o = PDBAnalyzer(pdb_file)
o.change_chain_identifier(chain_id, new_chain_id, split=split).to_pdb(pdb_file)
return pdb_file
def run(self, typestr:str = 'refine.very_fast', buildnumber: str = 1) -> Path:
split_dict = self.split_all_chains()
mc_dict = self.model_missing_loops(typestr=typestr, buildnumber=buildnumber)
split_dict.update(mc_dict) # 更新 split_dict
out_file = self.output_dir.joinpath(f'{self.analyzer_instance.pid}.modellerfix.pdb')
self.import_and_merge_pdb_strings(split_dict, "merged_object", out_file.as_posix())
def import_and_merge_pdb_strings(self, pdb_strings, merged_object_name, output_file):
# 使用 PyMOL 实例导入和合并 PDB
for chain_id, pdb_str in pdb_strings.items():
object_name = f"chain_{chain_id}"
self.pymol_instance.read_pdbstr(pdb_str, object_name)
object_names = [f"chain_{chain_id}" for chain_id in pdb_strings.keys()]
self.pymol_instance.create(merged_object_name, ' or '.join(object_names))
self.pymol_instance.save(output_file, merged_object_name)
def main():
# 创建ArgumentParser对象
parser = argparse.ArgumentParser(description="Loop Model Builder for PDB files")
# 添加参数
parser.add_argument("pdb_file", type=str, help="Path to the PDB file")
parser.add_argument("--typestr", type=str, default="refine.very_fast",
help="Model refinement type (default: refine.very_fast)")
parser.add_argument("--output_dir", type=str, default=None,
help="Output directory (default: same as PDB file directory)")
parser.add_argument("--use_pymol_align_merge", action='store_true', help="Use PyMOL for alignment and merging. Note: This may result in clashes between multiple protein chains, leading to failures in MD simulations.")
# 解析参数
args = parser.parse_args()
# 调用LoopModelBuilder
pdb_path = Path(args.pdb_file).absolute()
output_dir = Path(args.output_dir).absolute() if args.output_dir else None
model_builder = LoopModelBuilder(pdb_file=pdb_path, output_dir=output_dir, use_pymol_align_merge=args.use_pymol_align_merge)
result_path = model_builder.run(typestr=args.typestr)
print(f"Modeling completed. Output file: {result_path}")
if __name__ == "__main__":
main()