218 lines
9.2 KiB
Python
218 lines
9.2 KiB
Python
#!/usr/bin/env python
|
|
# -*- encoding: utf-8 -*-
|
|
'''
|
|
@file :modelbuilder.py
|
|
@Description: :
|
|
@Date :2024/01/11 15:29:28
|
|
@Author :lyzeng
|
|
@Email :pylyzeng@gmail.com
|
|
@version :1.0
|
|
'''
|
|
import argparse
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
import os
|
|
import logging
|
|
from analysis_pdb import PDBAnalyzer
|
|
from build_modeller import PDBModeler
|
|
from modeller import ModellerError
|
|
import pymol
|
|
from typing import Dict
|
|
from Bio.SeqRecord import SeqRecord
|
|
from concurrent.futures import ProcessPoolExecutor
|
|
from functools import partial
|
|
|
|
@dataclass
|
|
class PDBAlign:
|
|
template_file: Path
|
|
target_file: Path
|
|
pymol_instance: object = field(init=False)
|
|
out_file: Path = field(default=None) # 输出文件路径
|
|
|
|
def __post_init__(self):
|
|
self.initialize_pymol()
|
|
|
|
def initialize_pymol(self):
|
|
self.pymol_instance = pymol.cmd
|
|
self.pymol_instance.reinitialize()
|
|
|
|
def align(self) -> str:
|
|
self.pymol_instance.reinitialize()
|
|
# 首先,加载模板结构
|
|
self.pymol_instance.load(self.template_file.as_posix(), "template")
|
|
|
|
# 加载并对齐所有目标结构
|
|
self.pymol_instance.load(self.target_file.as_posix(), "target")
|
|
self.pymol_instance.align("target", "template")
|
|
|
|
return self.pymol_instance.get_pdbstr('target')
|
|
|
|
def save(self, out_file: Path):
|
|
self.pymol_instance.save(out_file.as_posix(), "target")
|
|
|
|
@dataclass
|
|
class LoopModelBuilder:
|
|
pdb_file: Path
|
|
output_dir: Path = field(default=None)
|
|
logger: logging.Logger = field(init=False)
|
|
pymol_instance: object = field(init=False)
|
|
analyzer_instance: PDBAnalyzer = field(init=False)
|
|
pdb_id: str = field(init=False)
|
|
|
|
def __post_init__(self):
|
|
self.pdb_id = self.pdb_file.stem
|
|
self.output_dir = self.pdb_file.parent if self.output_dir is None else self.output_dir
|
|
self.analyzer_instance = PDBAnalyzer(self.pdb_file)
|
|
self.setup_logging()
|
|
self.initialize_pymol()
|
|
|
|
def setup_logging(self):
|
|
log_file = self.output_dir / "loop_modeling.log"
|
|
if log_file.exists():
|
|
log_file.unlink()
|
|
|
|
# 创建 logger 实例
|
|
self.logger = logging.getLogger('LoopModelBuilder')
|
|
self.logger.setLevel(logging.INFO)
|
|
|
|
# 创建 file handler
|
|
file_handler = logging.FileHandler(log_file, mode='a')
|
|
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
|
|
|
# 创建 console handler
|
|
console_handler = logging.StreamHandler()
|
|
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
|
|
|
|
# 将 handlers 添加到 logger
|
|
self.logger.addHandler(file_handler)
|
|
self.logger.addHandler(console_handler)
|
|
|
|
|
|
def initialize_pymol(self):
|
|
self.pymol_instance = pymol.cmd
|
|
self.pymol_instance.reinitialize()
|
|
|
|
def split_chains(self) -> dict:
|
|
split_dict = {}
|
|
for chain_id in self.analyzer_instance.chain_id_list:
|
|
self.logger.info(f"Chain {chain_id} info(from file not fasta): \n{self.sequences[chain_id]}")
|
|
chain_file = self.output_dir.joinpath(f"{self.analyzer_instance.pid}_{chain_id}.pdb")
|
|
self.analyzer_instance.split_chain(chain_id).to_pdb(chain_file.as_posix())
|
|
split_dict[chain_id] = chain_file.read_text()
|
|
return split_dict
|
|
|
|
def merge_and_save_pdb(self, pdb_strings: dict, pdb_id: str):
|
|
merged_file = Path(f"{pdb_id}_merged.pdb")
|
|
self.import_and_merge_pdb_strings(pdb_strings, "merged_object", merged_file)
|
|
|
|
@property
|
|
def sequences(self) -> Dict:
|
|
return self.analyzer_instance.extract_sequences(missing_char='-') # 或者 'X', 或者 ''
|
|
|
|
@property
|
|
def missing_info(self) -> Dict:
|
|
return self.analyzer_instance.extract_sequences_info()
|
|
|
|
def split_all_chains(self):
|
|
self.logger.info(f'Residues info for {self.pdb_file}: \n',self.sequences)
|
|
self.logger.info(f'Missing residues info for {self.pdb_file}:\n {self.missing_info}')
|
|
split_dict = {} # split all chains
|
|
for j in self.analyzer_instance.chain_id_list:
|
|
self.logger.info(f"Chain {j} info(from file not fasta): \n{self.sequences[j]}")
|
|
fn = self.output_dir.joinpath(f"{self.analyzer_instance.pid}_{j}.pdb")
|
|
self.analyzer_instance.split_chain(j).to_pdb(fn.as_posix())
|
|
split_dict[j]=fn.read_text()
|
|
return split_dict
|
|
|
|
def model_missing_loops(self, typestr:str = 'refine.very_fast', buildnumber: str = 1) -> dict:
|
|
mc_dict = {}
|
|
if not self.missing_info:
|
|
self.logger.info(f'No missing residues found in {self.pdb_file}. Skipping model_missing_loops.')
|
|
return mc_dict
|
|
self.logger.info(f'Missing residues info for {self.pdb_file}:\n {self.missing_info}')
|
|
# create workdir
|
|
for mc in self.missing_info.keys():
|
|
self.logger.info(f'Building model for chain {mc}')
|
|
workdir = self.pdb_file.parent.joinpath(f'./{self.pdb_id}modellerfix_{mc}')
|
|
workdir.mkdir(exist_ok=True, parents=True)
|
|
mc_fasta = self.analyzer_instance.find_most_similar(self.sequences[mc]) # get misschain fasta file
|
|
# write fasta file
|
|
mc_fasta_record = SeqRecord(mc_fasta, id=f'{mc}', description=f'{self.pdb_id}_{mc}')
|
|
out_fasta_file = workdir.joinpath(f'{self.analyzer_instance.pid}_{mc}.fasta')
|
|
self.analyzer_instance.write_seq_to_fasta_single_line(mc_fasta_record, out_fasta_file)
|
|
self.logger.info(f'>{self.pdb_id}|{mc}|missing site:{self.missing_info[mc]}|length:{len(mc_fasta)}')
|
|
self.logger.info(mc_fasta)
|
|
# build model
|
|
modeller = PDBModeler(self.pdb_file, out_fasta_file, workdir, mc, buildnumber, typestr)
|
|
try:
|
|
modeller_results = modeller.make_model()
|
|
except ModellerError as mod_err:
|
|
self.logger.info(f'Failed to build model for chain {mc}')
|
|
self.logger.info(f'No loops detected in {out_fasta_file.name}')
|
|
self.logger.info(f'may pdb file sequence is not correct')
|
|
self.logger.error(f'Modeller error for chain {mc}: {mod_err}')
|
|
continue
|
|
except Exception as e:
|
|
self.logger.error(f'Unexpected error in model_missing_loops: {e}')
|
|
self.logger.info(f'Model files: {[file.name for file in modeller_results]}')
|
|
# change id to original
|
|
for i in modeller_results:
|
|
self.change_chain_identifier(i, 'A', mc, split=False)
|
|
# use pymol to align and merge
|
|
if len(modeller_results) == 1:
|
|
# use pymol to align
|
|
aligner = PDBAlign(self.pdb_file, modeller_results[0])
|
|
pdbstr = aligner.align()
|
|
mc_dict.update({mc: pdbstr})
|
|
else:
|
|
self.logger.warning('more than one model file, please set num_loop to 1')
|
|
return mc_dict
|
|
|
|
@staticmethod
|
|
def change_chain_identifier(pdb_file: Path, chain_id:str, new_chain_id:str, split:bool = True) -> Path:
|
|
o = PDBAnalyzer(pdb_file)
|
|
o.change_chain_identifier(chain_id, new_chain_id, split=split).to_pdb(pdb_file)
|
|
return pdb_file
|
|
|
|
def run(self, typestr:str = 'refine.very_fast', buildnumber: str = 1) -> Path:
|
|
split_dict = self.split_all_chains()
|
|
mc_dict = self.model_missing_loops(typestr=typestr, buildnumber=buildnumber)
|
|
split_dict.update(mc_dict) # 更新 split_dict
|
|
out_file = self.output_dir.joinpath(f'{self.analyzer_instance.pid}.modellerfix.pdb')
|
|
self.import_and_merge_pdb_strings(split_dict, "merged_object", out_file.as_posix())
|
|
|
|
def import_and_merge_pdb_strings(self, pdb_strings, merged_object_name, output_file):
|
|
# 使用 PyMOL 实例导入和合并 PDB
|
|
for chain_id, pdb_str in pdb_strings.items():
|
|
object_name = f"chain_{chain_id}"
|
|
self.pymol_instance.read_pdbstr(pdb_str, object_name)
|
|
|
|
object_names = [f"chain_{chain_id}" for chain_id in pdb_strings.keys()]
|
|
self.pymol_instance.create(merged_object_name, ' or '.join(object_names))
|
|
self.pymol_instance.save(output_file, merged_object_name)
|
|
|
|
def main():
|
|
# 创建ArgumentParser对象
|
|
parser = argparse.ArgumentParser(description="Loop Model Builder for PDB files")
|
|
# 添加参数
|
|
parser.add_argument("pdb_file", type=str, help="Path to the PDB file")
|
|
parser.add_argument("--typestr", type=str, default="refine.very_fast",
|
|
help="Model refinement type (default: refine.very_fast)")
|
|
parser.add_argument("--output_dir", type=str, default=None,
|
|
help="Output directory (default: same as PDB file directory)")
|
|
|
|
# 解析参数
|
|
args = parser.parse_args()
|
|
|
|
# 调用LoopModelBuilder
|
|
pdb_path = Path(args.pdb_file).absolute()
|
|
output_dir = Path(args.output_dir).absolute() if args.output_dir else None
|
|
|
|
model_builder = LoopModelBuilder(pdb_file=pdb_path, output_dir=output_dir)
|
|
result_path = model_builder.run(typestr=args.typestr)
|
|
|
|
print(f"Modeling completed. Output file: {result_path}")
|
|
|
|
if __name__ == "__main__":
|
|
main()
|