#!/usr/bin/env python # -*- encoding: utf-8 -*- ''' @file :modelbuilder.py @Description: : @Date :2024/01/11 15:29:28 @Author :lyzeng @Email :pylyzeng@gmail.com @version :1.0 ''' import argparse from dataclasses import dataclass, field from pathlib import Path import logging from analysis_pdb import PDBAnalyzer from build_modeller import PDBModeler from modeller import ModellerError import pymol from typing import Dict from Bio.SeqRecord import SeqRecord @dataclass class PDBAlign: template_file: Path target_file: Path pymol_instance: object = field(init=False) out_file: Path = field(default=None) # 输出文件路径 def __post_init__(self): self.initialize_pymol() def initialize_pymol(self): self.pymol_instance = pymol.cmd self.pymol_instance.reinitialize() def align(self) -> str: self.pymol_instance.reinitialize() # 首先,加载模板结构 self.pymol_instance.load(self.template_file.as_posix(), "template") # 加载并对齐所有目标结构 self.pymol_instance.load(self.target_file.as_posix(), "target") self.pymol_instance.align("target", "template") return self.pymol_instance.get_pdbstr('target') def save(self, out_file: Path): self.pymol_instance.save(out_file.as_posix(), "target") @dataclass class LoopModelBuilder: pdb_file: Path output_dir: Path = field(default=None) logger: logging.Logger = field(init=False) pymol_instance: object = field(init=False) analyzer_instance: PDBAnalyzer = field(init=False) pdb_id: str = field(init=False) use_pymol_align_merge: bool = False # 控制是否使用 PyMOL 进行对齐和合并的标志 def __post_init__(self): self.pdb_id = self.pdb_file.stem self.output_dir = self.pdb_file.parent if self.output_dir is None else self.output_dir self.analyzer_instance = PDBAnalyzer(self.pdb_file) self.setup_logging() self.initialize_pymol() def setup_logging(self): log_file = self.output_dir / "loop_modeling.log" if log_file.exists(): log_file.unlink() # 创建 logger 实例 self.logger = logging.getLogger('LoopModelBuilder') self.logger.setLevel(logging.INFO) # 创建 file handler file_handler = logging.FileHandler(log_file, mode='a') file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) # 创建 console handler console_handler = logging.StreamHandler() console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')) # 将 handlers 添加到 logger self.logger.addHandler(file_handler) self.logger.addHandler(console_handler) def initialize_pymol(self): self.pymol_instance = pymol.cmd self.pymol_instance.reinitialize() def split_chains(self) -> dict: split_dict = {} for chain_id in self.analyzer_instance.chain_id_list: self.logger.info(f"Chain {chain_id} info(from file not fasta): \n{self.sequences[chain_id]}") chain_file = self.output_dir.joinpath(f"{self.analyzer_instance.pid}_{chain_id}.pdb") self.analyzer_instance.split_chain(chain_id).to_pdb(chain_file.as_posix()) split_dict[chain_id] = chain_file.read_text() return split_dict def merge_and_save_pdb(self, pdb_strings: dict, pdb_id: str): merged_file = Path(f"{pdb_id}_merged.pdb") self.import_and_merge_pdb_strings(pdb_strings, "merged_object", merged_file) @property def sequences(self) -> Dict: return self.analyzer_instance.extract_sequences(missing_char='-') # 或者 'X', 或者 '' @property def missing_info(self) -> Dict: return self.analyzer_instance.extract_sequences_info() def split_all_chains(self): self.logger.info(f'Residues info for {self.pdb_file}: \n',self.sequences) self.logger.info(f'Missing residues info for {self.pdb_file}:\n {self.missing_info}') split_dict = {} # split all chains for j in self.analyzer_instance.chain_id_list: self.logger.info(f"Chain {j} info(from file not fasta): \n{self.sequences[j]}") fn = self.output_dir.joinpath(f"{self.analyzer_instance.pid}_{j}.pdb") self.analyzer_instance.split_chain(j).to_pdb(fn.as_posix()) split_dict[j]=fn.read_text() return split_dict def model_missing_loops(self, typestr:str = 'refine.very_fast', buildnumber: str = 1) -> dict: mc_dict = {} if not self.missing_info: self.logger.info(f'No missing residues found in {self.pdb_file}. Skipping model_missing_loops.') return mc_dict self.logger.info(f'Missing residues info for {self.pdb_file}:\n {self.missing_info}') # create workdir for mc in self.missing_info.keys(): self.logger.info(f'Building model for chain {mc}') workdir = self.pdb_file.parent.joinpath(f'./{self.pdb_id}modellerfix_{mc}') workdir.mkdir(exist_ok=True, parents=True) mc_fasta = self.analyzer_instance.find_most_similar(self.sequences[mc]) # get misschain fasta file # write fasta file mc_fasta_record = SeqRecord(mc_fasta, id=f'{mc}', description=f'{self.pdb_id}_{mc}') out_fasta_file = workdir.joinpath(f'{self.analyzer_instance.pid}_{mc}.fasta') self.analyzer_instance.write_seq_to_fasta_single_line(mc_fasta_record, out_fasta_file) self.logger.info(f'>{self.pdb_id}|{mc}|missing site:{self.missing_info[mc]}|length:{len(mc_fasta)}') self.logger.info(mc_fasta) # build model modeller = PDBModeler(self.pdb_file, out_fasta_file, workdir, mc, buildnumber, typestr) try: modeller_results = modeller.make_model() except ModellerError as mod_err: self.logger.info(f'Failed to build model for chain {mc}') self.logger.info(f'No loops detected in {out_fasta_file.name}') self.logger.info(f'may pdb file sequence is not correct') self.logger.error(f'Modeller error for chain {mc}: {mod_err}') continue except Exception as e: self.logger.error(f'Unexpected error in model_missing_loops: {e}') self.logger.info(f'Model files: {[file.name for file in modeller_results]}') # change id to original for i in modeller_results: self.change_chain_identifier(i, 'A', mc, split=False) # Use PyMOL to align and merge, controlled by use_pymol_align_merge flag if self.use_pymol_align_merge: self.logger.warning("PyMOL alignment and merging is disabled or multiple model files exist. " "This may result in clashes between multiple protein chains, leading to failures in MD simulations. " "Please ensure the structures are compatible for simulations.") if len(modeller_results) == 1: aligner = PDBAlign(self.pdb_file, modeller_results[0]) pdbstr = aligner.align() mc_dict.update({mc: pdbstr}) else: self.logger.warning('more than one model file, please set num_loop to 1') return mc_dict @staticmethod def change_chain_identifier(pdb_file: Path, chain_id:str, new_chain_id:str, split:bool = True) -> Path: o = PDBAnalyzer(pdb_file) o.change_chain_identifier(chain_id, new_chain_id, split=split).to_pdb(pdb_file) return pdb_file def run(self, typestr:str = 'refine.very_fast', buildnumber: str = 1) -> Path: split_dict = self.split_all_chains() mc_dict = self.model_missing_loops(typestr=typestr, buildnumber=buildnumber) split_dict.update(mc_dict) # 更新 split_dict out_file = self.output_dir.joinpath(f'{self.analyzer_instance.pid}.modellerfix.pdb') self.import_and_merge_pdb_strings(split_dict, "merged_object", out_file.as_posix()) def import_and_merge_pdb_strings(self, pdb_strings, merged_object_name, output_file): # 使用 PyMOL 实例导入和合并 PDB for chain_id, pdb_str in pdb_strings.items(): object_name = f"chain_{chain_id}" self.pymol_instance.read_pdbstr(pdb_str, object_name) object_names = [f"chain_{chain_id}" for chain_id in pdb_strings.keys()] self.pymol_instance.create(merged_object_name, ' or '.join(object_names)) self.pymol_instance.save(output_file, merged_object_name) def main(): # 创建ArgumentParser对象 parser = argparse.ArgumentParser(description="Loop Model Builder for PDB files") # 添加参数 parser.add_argument("pdb_file", type=str, help="Path to the PDB file") parser.add_argument("--typestr", type=str, default="refine.very_fast", help="Model refinement type (default: refine.very_fast)") parser.add_argument("--output_dir", type=str, default=None, help="Output directory (default: same as PDB file directory)") parser.add_argument("--use_pymol_align_merge", action='store_true', help="Use PyMOL for alignment and merging. Note: This may result in clashes between multiple protein chains, leading to failures in MD simulations.") # 解析参数 args = parser.parse_args() # 调用LoopModelBuilder pdb_path = Path(args.pdb_file).absolute() output_dir = Path(args.output_dir).absolute() if args.output_dir else None model_builder = LoopModelBuilder(pdb_file=pdb_path, output_dir=output_dir, use_pymol_align_merge=args.use_pymol_align_merge) result_path = model_builder.run(typestr=args.typestr) print(f"Modeling completed. Output file: {result_path}") if __name__ == "__main__": main()