Files
analysis_pdb/modelbuilder.py
2024-01-11 18:09:14 +08:00

197 lines
8.4 KiB
Python

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :modelbuilder.py
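@Description: :Split a PDB file into chains, rebuild missing loops per chain with MODELLER, and merge the chains back into one PDB via PyMOL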
@Date :2024/01/11 15:29:28
@Author :lyzeng
@Email :pylyzeng@gmail.com
@version :1.0
'''
from dataclasses import dataclass, field
from pathlib import Path
import os
import logging
from analysis_pdb import PDBAnalyzer
from build_modeller import PDBModeler
from modeller import ModellerError
import pymol
from typing import Dict
@dataclass
class PDBAlign:
    """Align one PDB structure onto a template through the PyMOL command API.

    ``template_file`` and ``target_file`` are loaded into the PyMOL session
    as the objects "template" and "target"; :meth:`align` returns the PDB
    text of the target after alignment.
    """

    template_file: Path
    target_file: Path
    pymol_instance: object = field(init=False)
    # Output file path. Stored on the instance; no method of this class reads it.
    out_file: Path = None

    def __post_init__(self):
        """Bind and reset the PyMOL session right after construction."""
        self.initialize_pymol()

    def initialize_pymol(self):
        """Store ``pymol.cmd`` on the instance and reinitialize it."""
        self.pymol_instance = pymol.cmd
        self.pymol_instance.reinitialize()

    def align(self):
        """Load both structures, align target onto template, return the target.

        Returns:
            The PDB-formatted string PyMOL produces for the "target" object
            after ``align("target", "template")``.
        """
        session = self.pymol_instance
        session.reinitialize()
        # The template is loaded first, then the structure to be aligned.
        for structure_path, object_name in (
            (self.template_file, "template"),
            (self.target_file, "target"),
        ):
            session.load(structure_path.as_posix(), object_name)
        session.align("target", "template")
        aligned_pdb = session.get_pdbstr('target')
        return aligned_pdb
@dataclass
class LoopModelBuilder:
    """Rebuild missing loops of a PDB file chain by chain and merge the result.

    The workflow (see :meth:`run`) is:

    1. split every chain of ``pdb_file`` into its own PDB string,
    2. for every chain reported by ``PDBAnalyzer.extract_sequences_info()``,
       build a model with ``PDBModeler`` and align it back onto the input
       structure with :class:`PDBAlign`,
    3. replace the original chains by the modelled ones and write a single
       merged PDB through PyMOL.

    Attributes:
        pdb_file: Input PDB file.
        output_dir: Directory for the log file and the ``runner_<id>``
            working directory; defaults to the directory of ``pdb_file``.
        logger: Logger named ``'LoopModelBuilder'``.
        pymol_instance: ``pymol.cmd`` (a process-global session).
        analyzer_instance: ``PDBAnalyzer`` built from ``pdb_file``.
        runner_dir: ``output_dir / "runner_<pdb stem>"``.
        pdb_id: Stem of ``pdb_file``.
    """

    pdb_file: Path
    output_dir: Path = field(default=None)
    logger: logging.Logger = field(init=False)
    pymol_instance: object = field(init=False)
    # Quoted so the annotation is not evaluated when the class is created.
    analyzer_instance: "PDBAnalyzer" = field(init=False)
    runner_dir: Path = field(init=False)
    pdb_id: str = field(init=False)

    def __post_init__(self):
        """Derive paths, create the working directory, set up logging and PyMOL."""
        # Accept plain strings as well as Path objects.
        self.pdb_file = Path(self.pdb_file)
        self.pdb_id = self.pdb_file.stem
        if self.output_dir is None:
            self.output_dir = self.pdb_file.parent
        else:
            self.output_dir = Path(self.output_dir)
        self.runner_dir = self.output_dir / f"runner_{self.pdb_id}"
        self.analyzer_instance = PDBAnalyzer(self.pdb_file)
        self.initialize_workdir()
        self.setup_logging()
        self.initialize_pymol()

    def setup_logging(self):
        """Configure file logging to ``output_dir/loop_modeling.log``.

        The file is opened in ``'w'`` mode so a first configuration starts
        from an empty log. The log file is deliberately not unlinked
        beforehand: ``logging.basicConfig`` does nothing once the root logger
        has handlers, so deleting the file on a later call would remove the
        file an existing handler is still writing to.
        """
        log_file = self.output_dir / "loop_modeling.log"
        logging.basicConfig(level=logging.INFO, filename=log_file, filemode='w',
                            format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger('LoopModelBuilder')

    def initialize_pymol(self):
        """Store ``pymol.cmd`` on the instance and reinitialize it."""
        self.pymol_instance = pymol.cmd
        self.pymol_instance.reinitialize()

    def initialize_workdir(self):
        """Create ``output_dir/runner_<pdb stem>`` (with parents) and remember it."""
        runner_dir = self.output_dir / f"runner_{self.pdb_file.stem}"
        runner_dir.mkdir(exist_ok=True, parents=True)
        self.runner_dir = runner_dir

    def split_chains(self) -> dict:
        """Write every chain to ``runner_dir`` and return the PDB text per chain.

        Returns:
            Mapping ``chain_id -> PDB text`` read back from the files
            ``runner_dir/<pid>_<chain_id>.pdb``.
        """
        split_dict = {}
        for chain_id in self.analyzer_instance.chain_id_list:
            chain_file = self.runner_dir / f"{self.analyzer_instance.pid}_{chain_id}.pdb"
            self.analyzer_instance.split_chain(chain_id).to_pdb(chain_file.as_posix())
            split_dict[chain_id] = chain_file.read_text()
        return split_dict

    def fix_loops(self, pdb_file: Path):
        """Split, model and merge ``pdb_file``; save ``<stem>_merged.pdb`` in its runner dir.

        ``split_chains`` and ``model_missing_loops`` operate on the builder's
        own analyzer and take no analyzer/pdb-id arguments, so a different
        ``pdb_file`` is processed by a sibling builder that shares this
        builder's ``output_dir``.

        Args:
            pdb_file: PDB file to repair; may be the builder's own file.
        """
        pdb_file = Path(pdb_file)
        if pdb_file == self.pdb_file:
            builder = self
        else:
            builder = type(self)(pdb_file, self.output_dir)
        split_dict = builder.split_chains()
        # Modelled chains replace the original chains with the same id.
        split_dict.update(builder.model_missing_loops())
        builder.merge_and_save_pdb(split_dict, builder.pdb_id)

    def merge_and_save_pdb(self, pdb_strings: dict, pdb_id: str):
        """Merge ``pdb_strings`` and save them as ``runner_dir/<pdb_id>_merged.pdb``."""
        merged_file = self.runner_dir / f"{pdb_id}_merged.pdb"
        self.import_and_merge_pdb_strings(pdb_strings, "merged_object", merged_file)

    @property
    def sequences(self) -> Dict:
        """Result of ``extract_sequences`` with ``'-'`` as the missing-residue character."""
        # Other choices for missing_char would be 'X' or ''.
        return self.analyzer_instance.extract_sequences(missing_char='-')

    @property
    def missing_info(self) -> Dict:
        """Result of ``extract_sequences_info``; its keys are iterated as chain ids."""
        return self.analyzer_instance.extract_sequences_info()

    def split_all_chains(self):
        """Log sequence information and split every chain into the current directory.

        Returns:
            Mapping ``chain_id -> PDB text`` read back from the files
            ``<pdb stem>_<chain_id>.pdb`` written to the working directory.
        """
        # Other choices for missing_char would be 'X' or ''.
        sequences = self.analyzer_instance.extract_sequences(missing_char='-')
        # A %s placeholder is required, otherwise the sequences never reach the log.
        self.logger.info('Residues info for %s: \n%s', self.pdb_file, sequences)
        missing_info = self.analyzer_instance.extract_sequences_info()
        self.logger.info(f'Missing residues info for {self.pdb_file}:\n {missing_info}')
        split_dict = {}  # one entry per chain
        for j in self.analyzer_instance.chain_id_list:
            fn = Path(f'{self.pdb_file.stem}_{j}.pdb')
            self.analyzer_instance.split_chain(j).to_pdb(fn.as_posix())
            split_dict[j] = fn.read_text()
        return split_dict

    def model_missing_loops(self, typestr: str = 'refine.very_fast') -> dict:
        """Model every chain listed in ``missing_info`` and align it to the input.

        Chains that cannot be modelled (no FASTA record, MODELLER failure,
        unexpected error, or more than one model file) are logged and skipped;
        the remaining chains are still processed.

        Args:
            typestr: Passed through to ``PDBModeler`` as its last argument.

        Returns:
            Mapping ``chain_id -> aligned PDB text`` for each modelled chain.
            Always a dict; empty when nothing was modelled.

        Raises:
            ValueError: If ``filter_sequences`` returns more than one record
                for a chain.
        """
        mc_dict = {}
        pid = self.analyzer_instance.pid
        for mc in self.missing_info:
            # Write the chain with missing residues as its own PDB file.
            out_file = f'{pid}_{mc}.pdb'
            self.analyzer_instance.split_chain(mc).to_pdb(out_file)
            # Sequence record(s) belonging to this chain.
            mc_fasta = self.analyzer_instance.filter_sequences(mc)
            if len(mc_fasta) == 0:
                self.logger.warning(f'No chain {mc} found in PDB fasta file. Skipping chain {mc}.')
                continue
            if len(mc_fasta) != 1:
                raise ValueError(f'only can fix one chain content: {mc_fasta}')

            mc_fasta = mc_fasta[0]
            out_fasta_file = Path(f'{pid}_{mc}.fasta')
            self.analyzer_instance.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
            self.logger.info(f'>{mc_fasta.description}')
            self.logger.info(mc_fasta.seq)
            # NOTE(review): Path('.') and 1 are presumably the output directory
            # and the number of loop models -- confirm against PDBModeler.
            modeller = PDBModeler(self.pdb_file, out_fasta_file, Path('.'), mc, 1, typestr)
            try:
                modeller_results = modeller.make_model()
            except ModellerError as mod_err:
                self.logger.info(f'Failed to build model for chain {mc}')
                self.logger.info(f'No loops detected in {out_fasta_file.name}')
                self.logger.info(f'may pdb file sequence is not correct')
                self.logger.error(f'Modeller error for chain {mc}: {mod_err}')
                continue
            except Exception as e:
                self.logger.error(f'Unexpected error in model_missing_loops: {e}')
                # No usable result for this chain; without skipping, the code
                # below would use an unbound or stale ``modeller_results``.
                continue

            self.logger.info(f'Model files: {[file.name for file in modeller_results]}')
            # Rename chain 'A' in the model files back to the original chain id.
            for i in modeller_results:
                manalyzer = PDBAnalyzer(i)
                manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i)

            if len(modeller_results) != 1:
                self.logger.warning('more than one model file, please set num_loop to 1')
                continue

            # Align the model onto the input structure with PyMOL.
            aligner = PDBAlign(self.pdb_file, modeller_results[0],
                               Path(f'{pid}_merge_model.pdb'))
            mc_dict[mc] = aligner.align()
        # Returned after the loop so every chain with missing residues is handled.
        return mc_dict

    def run(self, typestr: str = 'refine.very_fast'):
        """Run the full workflow and write ``<pid>.modellerfix.pdb`` to the working directory.

        Args:
            typestr: Forwarded to :meth:`model_missing_loops`.
        """
        split_dict = self.split_all_chains()
        mc_dict = self.model_missing_loops(typestr=typestr)
        # Modelled chains replace the original chains with the same id.
        split_dict.update(mc_dict)
        self.import_and_merge_pdb_strings(split_dict, "merged_object",
                                          f'{self.analyzer_instance.pid}.modellerfix.pdb')

    def import_and_merge_pdb_strings(self, pdb_strings, merged_object_name, output_file):
        """Load PDB strings into PyMOL, merge them into one object and save it.

        Args:
            pdb_strings: Mapping ``chain_id -> PDB text``; each entry becomes
                the PyMOL object ``chain_<chain_id>``.
            merged_object_name: Name of the merged PyMOL object.
            output_file: Destination file (``str`` or ``Path``).
        """
        # ``pymol.cmd`` is shared by the whole process; start from a clean
        # session so objects left by earlier alignments or merges cannot
        # collide with the ``chain_*`` names loaded here.
        self.pymol_instance.reinitialize()
        object_names = []
        for chain_id, pdb_str in pdb_strings.items():
            object_name = f"chain_{chain_id}"
            self.pymol_instance.read_pdbstr(pdb_str, object_name)
            object_names.append(object_name)
        self.pymol_instance.create(merged_object_name, ' or '.join(object_names))
        # Hand PyMOL a plain string so Path arguments work as well.
        self.pymol_instance.save(str(output_file), merged_object_name)
if __name__ == "__main__":
    # Script entry point: run the loop-fixing workflow on the 1ao7 test file.
    test_structure = Path('./pdb_test1/1ao7.pdb')
    builder = LoopModelBuilder(test_structure)
    builder.run()