#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :modelbuilder.py
@Description: :
@Date :2024/01/11 15:29:28
@Author :lyzeng
@Email :pylyzeng@gmail.com
@version :1.0
'''

from dataclasses import dataclass, field
from pathlib import Path
import os
import logging
from analysis_pdb import PDBAnalyzer
from build_modeller import PDBModeler
from modeller import ModellerError
import pymol
from typing import Dict, Optional


@dataclass
class PDBAlign:
    """Align a target structure onto a template structure with PyMOL.

    Attributes:
        template_file: PDB file loaded as the PyMOL object ``template``.
        target_file: PDB file loaded as the PyMOL object ``target``.
        pymol_instance: The ``pymol.cmd`` module, set in ``__post_init__``.
        out_file: Optional output file path. It is stored but not used by
            any method of this class.
    """

    template_file: Path
    target_file: Path
    pymol_instance: object = field(init=False)
    out_file: Optional[Path] = None  # output file path

    def __post_init__(self):
        self.initialize_pymol()

    def initialize_pymol(self):
        """Bind ``pymol.cmd`` and reset the PyMOL session."""
        self.pymol_instance = pymol.cmd
        self.pymol_instance.reinitialize()

    def align(self):
        """Align the target onto the template.

        The PyMOL session is reinitialized first, so any objects that were
        loaded through ``pymol.cmd`` before this call are discarded.

        Returns:
            The PDB text of the ``target`` object after ``cmd.align``.
        """
        cmd = self.pymol_instance
        cmd.reinitialize()
        # Load the template structure first.
        cmd.load(self.template_file.as_posix(), "template")

        # Load the target structure and align it onto the template.
        cmd.load(self.target_file.as_posix(), "target")
        cmd.align("target", "template")

        return cmd.get_pdbstr('target')


@dataclass
class LoopModelBuilder:
    """Split a PDB file into chains, rebuild chains via MODELLER, and merge.

    Attributes:
        pdb_file: Input PDB file.
        output_dir: Directory for the log file and the ``runner_<pdb_id>``
            work directory. Defaults to the directory of ``pdb_file``.
        logger: Logger named ``'LoopModelBuilder'``.
        pymol_instance: The ``pymol.cmd`` module.
        analyzer_instance: ``PDBAnalyzer`` built from ``pdb_file``.
        runner_dir: ``output_dir / f"runner_{pdb_id}"``; created on init.
        pdb_id: Stem of ``pdb_file``.
    """

    pdb_file: Path
    output_dir: Optional[Path] = None
    logger: logging.Logger = field(init=False)
    pymol_instance: object = field(init=False)
    analyzer_instance: PDBAnalyzer = field(init=False)
    runner_dir: Path = field(init=False)
    pdb_id: str = field(init=False)

    def __post_init__(self):
        # Accept plain strings as well as Path objects.
        self.pdb_file = Path(self.pdb_file)
        self.pdb_id = self.pdb_file.stem
        if self.output_dir is None:
            self.output_dir = self.pdb_file.parent
        else:
            self.output_dir = Path(self.output_dir)
        self.analyzer_instance = PDBAnalyzer(self.pdb_file)
        # The work directory is created with parents=True, so output_dir
        # exists before the log file is opened inside it.
        self.initialize_workdir()
        self.setup_logging()
        self.initialize_pymol()

    def setup_logging(self):
        """Configure root logging to ``output_dir/loop_modeling.log``.

        A log file left over from an earlier run is removed only when the
        root logger has no handlers yet. Once handlers exist,
        ``logging.basicConfig`` is a no-op and the file may already be held
        open by one of them, so deleting it would discard later output
        (or fail outright on Windows).
        """
        log_file = self.output_dir / "loop_modeling.log"
        root_is_configured = bool(logging.getLogger().handlers)
        if not root_is_configured and log_file.exists():
            log_file.unlink()
        logging.basicConfig(level=logging.INFO, filename=log_file, filemode='a',
                            format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger('LoopModelBuilder')

    def initialize_pymol(self):
        """Bind ``pymol.cmd`` and reset the PyMOL session."""
        self.pymol_instance = pymol.cmd
        self.pymol_instance.reinitialize()

    def initialize_workdir(self):
        """Create ``output_dir/runner_<pdb stem>`` and store it as ``runner_dir``."""
        runner_dir = self.output_dir / f"runner_{self.pdb_file.stem}"
        runner_dir.mkdir(exist_ok=True, parents=True)
        self.runner_dir = runner_dir

    def split_chains(self) -> dict:
        """Write every chain to its own PDB file inside ``runner_dir``.

        Returns:
            Mapping of chain id to the text of the written chain file.
        """
        analyzer = self.analyzer_instance
        split_dict = {}
        for chain_id in analyzer.chain_id_list:
            chain_file = self.runner_dir / f"{analyzer.pid}_{chain_id}.pdb"
            analyzer.split_chain(chain_id).to_pdb(chain_file.as_posix())
            split_dict[chain_id] = chain_file.read_text()
        return split_dict

    def fix_loops(self, pdb_file: Path):
        """Split, model and merge ``pdb_file``; save ``<stem>_merged.pdb``.

        When ``pdb_file`` is this builder's own input the work is done by
        this instance. Otherwise a builder of the same type is created for
        that file, sharing ``output_dir``. The merged structure is written
        to that builder's ``runner_dir``.

        Args:
            pdb_file: PDB file to process.
        """
        pdb_file = Path(pdb_file)
        if pdb_file == self.pdb_file:
            builder = self
        else:
            builder = type(self)(pdb_file, self.output_dir)
        split_dict = builder.split_chains()
        # Modelled chains replace the raw chain text with the same id.
        split_dict.update(builder.model_missing_loops())
        builder.merge_and_save_pdb(split_dict, builder.pdb_id)

    def merge_and_save_pdb(self, pdb_strings: dict, pdb_id: str):
        """Merge chain PDB strings and save ``runner_dir/<pdb_id>_merged.pdb``.

        Args:
            pdb_strings: Mapping of chain id to PDB text.
            pdb_id: Prefix of the merged file name.
        """
        merged_file = self.runner_dir / f"{pdb_id}_merged.pdb"
        self.import_and_merge_pdb_strings(pdb_strings, "merged_object", merged_file)

    @property
    def sequences(self) -> Dict:
        """Result of ``analyzer_instance.extract_sequences(missing_char='-')``."""
        return self.analyzer_instance.extract_sequences(missing_char='-')  # or 'X', or ''

    @property
    def missing_info(self) -> Dict:
        """Result of ``analyzer_instance.extract_sequences_info()``."""
        return self.analyzer_instance.extract_sequences_info()

    def split_all_chains(self):
        """Log sequence information and write every chain to its own file.

        Chain files are named ``<pdb stem>_<chain>.pdb`` and are written to
        the current working directory.

        Returns:
            Mapping of chain id to the text of the written chain file.
        """
        analyzer = self.analyzer_instance
        sequences = analyzer.extract_sequences(missing_char='-')  # or 'X', or ''
        # Lazy %-formatting: the original passed ``sequences`` as a logging
        # argument without a placeholder, which made the record fail to
        # format and dropped the message.
        self.logger.info('Residues info for %s: \n%s', self.pdb_file, sequences)
        missing_info = analyzer.extract_sequences_info()
        self.logger.info(f'Missing residues info for {self.pdb_file}:\n {missing_info}')
        split_dict = {}  # split all chains
        for chain_id in analyzer.chain_id_list:
            chain_file = Path(f'{self.pdb_file.stem}_{chain_id}.pdb')
            analyzer.split_chain(chain_id).to_pdb(chain_file.as_posix())
            split_dict[chain_id] = chain_file.read_text()
        return split_dict

    def model_missing_loops(self, typestr: str = 'refine.very_fast') -> dict:
        """Model every chain listed in ``missing_info``.

        Chains that cannot be modelled (no FASTA record, a MODELLER failure,
        or a model count other than one) are logged and skipped, so one bad
        chain does not stop the others. Intermediate files are written to
        the current working directory.

        Args:
            typestr: Passed through to ``PDBModeler`` unchanged.

        Returns:
            Mapping of chain id to the PDB text of the modelled, aligned
            chain. Empty when no chain could be modelled.

        Raises:
            ValueError: If more than one FASTA record matches a chain.
        """
        analyzer = self.analyzer_instance
        mc_dict = {}
        for mc in self.missing_info:
            # Write the chain's own PDB file, named after the analyzed entry.
            analyzer.split_chain(mc).to_pdb(f'{analyzer.pid}_{mc}.pdb')
            mc_fasta = analyzer.filter_sequences(mc)
            if len(mc_fasta) == 0:
                self.logger.warning(f'No chain {mc} found in PDB fasta file. Skipping chain {mc}.')
                continue
            if len(mc_fasta) > 1:
                raise ValueError(f'only can fix one chain content: {mc_fasta}')
            pdbstr = self._model_single_chain(mc, mc_fasta[0], typestr)
            if pdbstr is not None:
                mc_dict[mc] = pdbstr
        return mc_dict

    def _model_single_chain(self, mc, mc_fasta, typestr: str) -> Optional[str]:
        """Build and align the model for one chain; return ``None`` on failure."""
        analyzer = self.analyzer_instance
        out_fasta_file = Path(f'{analyzer.pid}_{mc}.fasta')
        analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
        self.logger.info(f'>{mc_fasta.description}')
        self.logger.info(mc_fasta.seq)
        modeller = PDBModeler(self.pdb_file, out_fasta_file, Path('.'), mc, 1, typestr)
        try:
            modeller_results = modeller.make_model()
        except ModellerError as mod_err:
            self.logger.info(f'Failed to build model for chain {mc}')
            self.logger.info(f'No loops detected in {out_fasta_file.name}')
            self.logger.info(f'may pdb file sequence is not correct')
            self.logger.error(f'Modeller error for chain {mc}: {mod_err}')
            return None
        except Exception as e:
            # Stop here: there are no results to post-process for this chain.
            self.logger.exception(f'Unexpected error in model_missing_loops: {e}')
            return None

        self.logger.info(f'Model files: {[file.name for file in modeller_results]}')
        # Change the chain id of each model from 'A' back to the original id.
        for model_file in modeller_results:
            model_analyzer = PDBAnalyzer(model_file)
            model_analyzer.change_chain_identifier('A', mc, split=False).to_pdb(model_file)

        if len(modeller_results) != 1:
            self.logger.warning(
                f'expected exactly one model file for chain {mc}, '
                f'got {len(modeller_results)}; please set num_loop to 1'
            )
            return None

        # Use PyMOL to align the model onto the input structure.
        aligner = PDBAlign(self.pdb_file, modeller_results[0],
                           Path(f'{analyzer.pid}_merge_model.pdb'))
        return aligner.align()

    def run(self, typestr: str = 'refine.very_fast'):
        """Split all chains, model them, and save ``<pid>.modellerfix.pdb``.

        The merged file is written to the current working directory.

        Args:
            typestr: Passed through to ``model_missing_loops``.
        """
        split_dict = self.split_all_chains()
        mc_dict = self.model_missing_loops(typestr=typestr)
        split_dict.update(mc_dict)  # modelled chains replace the raw ones
        self.import_and_merge_pdb_strings(
            split_dict, "merged_object", f'{self.analyzer_instance.pid}.modellerfix.pdb')

    def import_and_merge_pdb_strings(self, pdb_strings, merged_object_name, output_file):
        """Load PDB strings into PyMOL, merge them, and save one file.

        Args:
            pdb_strings: Mapping of chain id to PDB text. Each entry is
                loaded as the object ``chain_<chain id>``.
            merged_object_name: Name of the merged PyMOL object to create.
            output_file: Destination path, as ``str`` or ``os.PathLike``.
        """
        cmd = self.pymol_instance
        object_names = []
        for chain_id, pdb_str in pdb_strings.items():
            object_name = f"chain_{chain_id}"
            cmd.read_pdbstr(pdb_str, object_name)
            object_names.append(object_name)

        cmd.create(merged_object_name, ' or '.join(object_names))
        # Callers pass both str and Path; hand PyMOL a plain string.
        cmd.save(os.fspath(output_file), merged_object_name)


if __name__ == "__main__":
    builder = LoopModelBuilder(Path('./pdb_test1/1ao7.pdb'))
    builder.run()