# Patch f71d0d8 ("init") by hotwa <pylyzeng@gmail.com>, 2023-12-04 — file 1/3.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file        : analysis_pdb.py
@Description : Analyze PDB structures: extract per-chain sequences, locate
               missing residues, split/rename chains, and drive Modeller
               loop repair followed by a PyMOL-based merge.
@Date        : 2023/11/27 09:57:00
@Author      : lyzeng
@Email       : pylyzeng@gmail.com
@version     : 1.0
'''
# micromamba create -n modeller modeller biopython pymol-open-source biopandas requests -y -c conda-forge -c salilab
# Modeller license key: MODELIRANJE
from dataclasses import dataclass, field
from functools import reduce, partial
from pathlib import Path
from copy import deepcopy
from typing import List, Dict, Tuple, Optional

import requests
from Bio import SeqIO
from Bio.PDB import PDBParser, MMCIFIO, PDBIO, Chain, Structure
# BUG FIX: Bio.PDB.Structure (the module) does not expose Model, so the
# original `Structure.Model.Model(...)` raised AttributeError; import it.
from Bio.PDB.Model import Model as BioModel
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import seq1
from biopandas.pdb import PandasPdb
from pymol import cmd


@dataclass
class PDBAnalyzer:
    """
    Analyze a protein structure read from a PDB file.

    Attributes:
        pdb_file: Path to the PDB file to be analyzed.
        pid: Lower-case 4-letter PDB id derived from the file stem, or None
            when the stem is not 4 characters long.
        structure: Parsed Bio.PDB structure (set in __post_init__).
        biodata: PandasPdb view of the same file (set in __post_init__).
        protein_state: 'Holo' when HETATM records are present, else 'Apo'.
        chain_id_list: Unique chain identifiers found in the ATOM records.
    """
    pdb_file: Path
    pid: Optional[str] = field(default=None, init=False)
    structure: object = field(init=False)
    # BUG FIX: this field was declared as `biodf` while __post_init__ and every
    # method use `self.biodata`; the declaration now matches the usage.
    biodata: PandasPdb = field(init=False)
    protein_state: str = field(init=False)  # 'Apo' or 'Holo'
    chain_id_list: List[str] = field(init=False)

    def __post_init__(self):
        """Parse the PDB file and derive the summary attributes."""
        parser = PDBParser(QUIET=True)
        self.pid = self.pdb_file.stem.lower() if len(self.pdb_file.stem) == 4 else None
        self.structure = parser.get_structure('PDB_structure', self.pdb_file.as_posix())
        self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix())
        # BUG FIX: PandasPdb always exposes a 'HETATM' key, so the original
        # membership test reported every structure as 'Holo'; check whether the
        # HETATM dataframe actually contains records instead.
        hetatm = self.biodata.df.get('HETATM')
        self.protein_state = 'Holo' if hetatm is not None and not hetatm.empty else 'Apo'
        self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()

    def check_continuity(self, chain, missing_char):
        """
        Walk a chain and fill numbering gaps with a placeholder residue.

        Args:
            chain (Chain): A Chain object from Bio.PDB.
            missing_char (str): Character used for missing residues.

        Returns:
            List[Tuple[str, int]]: (residue name, residue sequence number)
            pairs, with gaps in the numbering filled by missing_char entries.
        """
        def reducer(acc, residue):
            # Expected next number is one past the previously seen residue;
            # every skipped number becomes a placeholder entry.
            expected_resseq = acc[-1][1] + 1 if acc else residue.id[1]
            while expected_resseq < residue.id[1]:
                acc.append((missing_char, expected_resseq))
                expected_resseq += 1
            acc.append((residue.resname, residue.id[1]))
            return acc
        return reduce(reducer, chain, [])

    def residue_to_single_letter(self, residue_name):
        """
        Convert a three-letter residue name to its single-letter code.

        Args:
            residue_name (str): Three-letter residue name, or a placeholder.

        Returns:
            str: Single-letter code, or the input unchanged when it is one of
            the placeholder characters ('-', 'X', '').
        """
        if residue_name in {'-', 'X', ''}:
            return residue_name
        return seq1(residue_name)

    def extract_sequences(self, missing_char='-') -> Dict[str, str]:
        """
        Extract per-chain amino-acid sequences, marking gaps.

        Args:
            missing_char (str): Character inserted for missing residues.

        Returns:
            Dict[str, str]: Chain id -> single-letter sequence.
        """
        check_continuity_new = partial(self.check_continuity, missing_char=missing_char)
        sequences = {}
        for model in self.structure:
            for chain in model.get_list():
                chain_sequence = check_continuity_new(chain)
                sequences[chain.id] = ''.join(
                    self.residue_to_single_letter(res[0]) for res in chain_sequence)
        return sequences

    def extract_sequences_info(self) -> Dict[str, List[int]]:
        """
        Find missing residue numbers for every chain.

        Returns:
            Dict[str, List[int]]: Chain id -> sorted missing residue numbers.
            Chains without gaps are omitted.
        """
        def find_missing(chain):
            # Only standard residues (hetflag ' ') count as observed.
            observed = [residue.get_id()[1] for residue in chain if residue.id[0] == ' ']
            if observed:
                full_range = range(min(observed), max(observed) + 1)
                return chain.get_id(), sorted(set(full_range) - set(observed))
            return chain.get_id(), []

        chains = [chain for model in self.structure for chain in model]
        missing_info = map(find_missing, chains)
        return dict(filter(lambda x: x[1], missing_info))

    def extract_chain(self, chain_id: str):
        """
        Extract one chain with Biopython (HETATM records are kept).

        Args:
            chain_id (str): Id of the chain to extract.

        Returns:
            object: Helper object exposing save(filename, file_format).

        Raises:
            ValueError: If the chain id is not present in the structure.
        """
        target_chain = next(
            (chain for model in self.structure for chain in model if chain.id == chain_id),
            None)
        if target_chain is None:
            raise ValueError(f"Chain {chain_id} not found in the structure.")

        def save_to_file(filename: str, file_format: str = 'pdb'):
            """
            Save the extracted chain in the requested format.

            Args:
                filename (str): Output file name.
                file_format (str): 'pdb' or 'cif'. Default is 'pdb'.
            """
            if file_format not in ['pdb', 'cif']:
                raise ValueError("Unsupported file format. Use 'pdb' or 'cif'.")
            io = PDBIO() if file_format == 'pdb' else MMCIFIO()
            io.set_structure(target_chain)
            io.save(filename)

        return type('ChainExtractor', (object,), {'save': save_to_file})

    def get_missing_residues_as_loops(self) -> Dict[str, List[Tuple[int, int, int]]]:
        """
        Format missing residues as PyRosetta-style loop definitions.

        Returns:
            Dict[str, List[Tuple[int, int, int]]]: Chain id -> list of
            (start, end, cut) tuples, one per missing residue.
        """
        missing_residues = self.extract_sequences_info()
        loops_by_chain = {}
        for chain_id, missing_res_nums in missing_residues.items():
            # Simple one-residue loops flanked by the neighbouring residues.
            loops_by_chain[chain_id] = [(num - 1, num + 1, num) for num in missing_res_nums]
        return loops_by_chain

    def change_chain_identifier(self, old_chain_id: str, new_chain_id: str,
                                split: bool = False) -> PandasPdb:
        """
        Rename a chain via biopandas (self.biodata is left untouched).

        Args:
            old_chain_id (str): Current chain identifier.
            new_chain_id (str): Replacement identifier.
            split (bool): When True, operate on just that chain's ATOM records.

        Returns:
            PandasPdb: New object with the renamed chain; call to_pdb to save.
        """
        biodata = self.split_chain(old_chain_id) if split else deepcopy(self.biodata)
        biodata.df['ATOM']['chain_id'] = biodata.df['ATOM']['chain_id'].replace(
            old_chain_id.strip().upper(), new_chain_id.strip().upper())
        return biodata

    def change_chain_identifier_biopython(self, old_chain_id: str, new_chain_id: str,
                                          split: bool = False) -> PDBIO:
        """
        Rename a chain via Biopython; NOTE(review): untested path.

        Args:
            old_chain_id (str): Current chain identifier.
            new_chain_id (str): Replacement identifier.
            split (bool): When True, build a new structure with only the
                renamed chain; otherwise rename in place on self.structure.

        Returns:
            PDBIO: Writer primed with the modified structure; call save().
        """
        io = PDBIO()

        def change_id(chain):
            if chain.id == old_chain_id:
                chain.id = new_chain_id
            return chain

        if split:
            # Build a fresh structure containing only the renamed chain.
            new_structure = Structure.Structure("modified_structure")
            for model in self.structure:
                filtered_chains = filter(lambda c: c.id == old_chain_id, model)
                changed_chains = map(change_id, filtered_chains)
                # BUG FIX: Structure.Model.Model is not importable from the
                # Structure module; use Bio.PDB.Model.Model (BioModel).
                new_model = reduce(lambda m, c: m.add(c) or m, changed_chains,
                                   BioModel(model.id))
                new_structure.add(new_model)
            io.set_structure(new_structure)
        else:
            # Rename directly on the full structure.
            all_chains = (chain for model in self.structure for chain in model)
            list(map(change_id, all_chains))
            io.set_structure(self.structure)

        return io

    def split_chain(self, chain_id: str) -> PandasPdb:
        """
        Extract one chain's ATOM records with biopandas (HETATM dropped).

        Args:
            chain_id (str): Id of the chain to extract.

        Returns:
            PandasPdb: New object holding only that chain; call to_pdb to save.
        """
        biodata = deepcopy(self.biodata)
        df = biodata.df['ATOM']
        tmp = PandasPdb()  # fresh, empty PDB container
        tmp.df['ATOM'] = df[df['chain_id'] == chain_id.strip().upper()]
        return tmp

    def save_biodata(self, out_file: Path):
        """Write the current biopandas view back to a PDB file."""
        self.biodata.to_pdb(out_file.as_posix())

    @staticmethod
    def write_seq_to_fasta_single_line(seq_record: SeqRecord, output_file: Path):
        """Write one SeqRecord as a two-line FASTA (header + unwrapped seq)."""
        with open(output_file, 'w') as output_handle:
            output_handle.write(f">{seq_record.description}\n")
            output_handle.write(str(seq_record.seq) + "\n")

    def download_fasta(self, pdb_id: str = None):
        """
        Download the FASTA entry for a PDB id from RCSB.

        Args:
            pdb_id (str): PDB id; defaults to self.pid.

        Returns:
            Path: Path of the downloaded FASTA file (next to self.pdb_file).

        Raises:
            ValueError: If no PDB id is available.
            Exception: If the HTTP download fails.
        """
        if pdb_id is None:
            pdb_id = self.pid
        if pdb_id is None:
            raise ValueError("PDB ID not found.")
        url = f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}"
        response = requests.get(url)
        if response.status_code == 200:
            output_file = self.pdb_file.parent / f"{pdb_id}.fasta"
            with open(output_file, 'w') as file:
                file.write(response.text)
            return output_file
        raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")

    def filter_sequences(self, chain_id: str) -> List[SeqRecord]:
        """Return the FASTA records whose description mentions this chain."""
        return list(filter(lambda x: f"Chain {chain_id}" in x.description, self.read_fasta()))

    def read_fasta(self) -> SeqRecord:
        """Parse (downloading first if needed) the structure's FASTA file."""
        fasta_file = self.pdb_file.parent / f"{self.pid}.fasta"
        if not fasta_file.exists():
            self.download_fasta()
        return SeqIO.parse(fasta_file.as_posix(), "fasta")


@dataclass
class PDBAlign:
    """Align a target structure onto a template with PyMOL."""
    template_file: Path
    target_file: Path
    out_file: Path = field(default=None)  # optional output path (unused by align)

    def align(self):
        """Load template and target, align target onto template, return the
        aligned target as a PDB-format string."""
        cmd.reinitialize()
        cmd.load(self.template_file.as_posix(), "template")
        cmd.load(self.target_file.as_posix(), "target")
        cmd.align("target", "template")
        return cmd.get_pdbstr('target')


def main(PDB_ID, PDB_file_path):
    """
    Run the full analyze / split / Modeller-repair / merge pipeline once.

    Args:
        PDB_ID (str): Identifier used to name all intermediate files.
        PDB_file_path (Path): Path of the input PDB file.
    """
    # Heavy optional dependencies: imported lazily so the analysis-only API
    # works without a Modeller installation.
    from build_modeller import PDBModeler
    from modeller import ModellerError

    analyzer = PDBAnalyzer(PDB_file_path)
    sequences = analyzer.extract_sequences(missing_char='-')  # or 'X', or ''
    print(f'Residues info for {PDB_ID}: \n', sequences)
    missing_info = analyzer.extract_sequences_info()
    print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
    # Example: extract chain A with Biopython (keeps HETATM records)
    chain_extractor = analyzer.extract_chain('A')
    chain_extractor.save('biopython_extracted_chain_A.pdb')
    # Example: extract chain A with biopandas (drops HETATM records)
    analyzer.split_chain('A').to_pdb('biopandas_extracted_chain_A.pdb')
    # Rename chain A to B and save it as a standalone file
    analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
    # Split every chain into its own file, remembering the text for the merge
    split_dict = {}
    for chain_id in analyzer.chain_id_list:
        fn = Path(f'{PDB_ID}_{chain_id}.pdb')
        analyzer.split_chain(chain_id).to_pdb(fn.as_posix())
        split_dict[chain_id] = fn.read_text()
    # Repair loop (missing-residue) regions with Modeller
    mc_dict = {}
    for mc in missing_info:
        # BUG FIX: this file name was hard-coded to '5sws_...'; use PDB_ID.
        out_file = f'{PDB_ID}_{mc}.pdb'
        analyzer.split_chain(mc).to_pdb(out_file)  # chain-with-gaps PDB file
        mc_fasta = analyzer.filter_sequences(mc)   # matching FASTA record(s)
        if len(mc_fasta) == 1:
            mc_fasta = mc_fasta[0]
            out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
            analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
            print(f'>{mc_fasta.description}')
            print(mc_fasta.seq)
            modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1,
                                  'refine.very_fast')
            try:
                modeller_results = modeller.make_model()
            except ModellerError:
                print(f'Failed to build model for chain {mc}')
                print(f'No loops detected in {out_fasta_file.name}')
                print(f'may pdb file sequence is not correct')
                continue
            print(f'Model files: {[file.name for file in modeller_results]}')
            # Modeller emits chain 'A'; restore the original chain id.
            for model_file in modeller_results:
                manalyzer = PDBAnalyzer(model_file)
                manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(model_file)
            if len(modeller_results) == 1:
                # Align the repaired chain back onto the original structure.
                aligner = PDBAlign(PDB_file_path, modeller_results[0],
                                   Path(f'{PDB_ID}_merge_model.pdb'))
                mc_dict[mc] = aligner.align()
            else:
                print('more than one model file, please set num_loop to 1')
        elif len(mc_fasta) == 0:
            # BUG FIX: this skip branch existed only in the duplicated
            # __main__ copy; without it a chain with no FASTA record raised.
            continue
        else:
            raise ValueError(f'only can fix one chain content: {mc_fasta}')

    # Merge the untouched chains with the repaired ones into a single file.
    split_dict.update(mc_dict)
    import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')


def import_and_merge_pdb_strings(pdb_strings, merged_object_name, output_file):
    """
    Import several PDB strings into PyMOL, merge them, and save one PDB file.

    Args:
        pdb_strings (dict): Object name -> PDB data string.
        merged_object_name (str): Name of the merged PyMOL object.
        output_file (str): Path of the output file.
    """
    # Re-initialize PyMOL (needed in scripted / non-interactive use).
    cmd.reinitialize()

    # Import every chain under a unique object name.
    for chain_id, pdb_str in pdb_strings.items():
        cmd.read_pdbstr(pdb_str, f"chain_{chain_id}")

    # Merge all objects into one and save it.
    object_names = [f"chain_{chain_id}" for chain_id in pdb_strings.keys()]
    cmd.create(merged_object_name, ' or '.join(object_names))
    cmd.save(output_file, merged_object_name)


if __name__ == "__main__":
    # BUG FIX: this block previously duplicated the whole body of main()
    # inline (with a stale hard-coded '5sws' file name) and the two copies
    # had drifted apart; it now delegates to main() so they cannot diverge.
    for pdb_path in Path('../PDBfile').glob('*.pdb'):
        main(pdb_path.stem, pdb_path)

# ========================= build_modellel.py (file 2/3) ========================
from pathlib import Path
from modeller import *
from modeller.automodel import *  # loads the AutoModel / LoopModel classes
import time
import os
# micromamba create -n modeller modeller biopython pymol-open-source biopandas -y -c conda-forge -c salilab
# Modeller license key: MODELIRANJE
# usage example: python /opt/software/docking_pipeline/scripts/protein_structure/modeller/build_modellel.py -s
# usage example (cont.): /mnt/AppData/task-27/ProteinCompletion/protein.pdb -f /mnt/AppData/task-27/ProteinCompletion/seq.fasta -o ./ -n 1 -m refine.fast

def make_model(structure_file,
               sequence_file,
               outdir: str,
               chain: str,
               num_loop: int = 2,
               md_level: str = 'refine.fast'
               ):
    """
    Align a PDB structure against a target sequence and rebuild missing loop
    regions with Modeller's LoopModel.

    Args:
        structure_file: Path of the template PDB file.
        sequence_file: Path of the target sequence in PIR (.ali) format.
        outdir: Directory receiving alignment.ali / alignment.pap; models are
            written to the current working directory by Modeller.
        chain: Chain identifier to model (FIRST:<chain> .. LAST:<chain>).
        num_loop: Number of loop models to generate (loop.ending_model).
        md_level: One of 'refine.slow', 'refine.fast', 'refine.very_fast'.

    Raises:
        ValueError: If md_level is not one of the supported settings.
    """
    print("***************************************************")
    print("md_level ====", md_level)
    print("***************************************************")

    # Align codes are derived from the file stems.
    structure = Path(structure_file).stem
    sequence = Path(sequence_file).stem
    time_start = time.time()

    # --- Align structure and sequence -------------------------------------
    env1 = Environ()
    # Read the template model restricted to the requested chain.
    mdl = Model(env1, file=structure_file,
                model_segment=(f'FIRST:{chain}', f'LAST:{chain}'))
    aln = Alignment(env1)
    aln.append_model(mdl, align_codes=structure, atom_files=structure_file)
    # Add the full target sequence to the alignment, then align in 2D.
    aln.append(file=sequence_file, align_codes=sequence)
    aln.align2d()
    aln.write(file=f'{outdir}/alignment.ali', alignment_format='PIR')
    aln.write(file=f'{outdir}/alignment.pap', alignment_format='PAP')

    log.verbose()

    # --- Rebuild the structure ---------------------------------------------
    env2 = Environ()
    env2.io.atom_files_directory = ['.']
    loop_model = LoopModel(env2,
                           alnfile=f'{outdir}/alignment.ali',
                           knowns=structure,
                           sequence=sequence,
                           loop_assess_methods=(assess.DOPE, assess.GA341))
    # Number of loop models generated: (ending - starting) + 1.
    loop_model.loop.starting_model = 1
    loop_model.loop.ending_model = int(num_loop)
    # Map the requested MD refinement level onto the Modeller constants.
    md_levels = {'refine.slow': refine.slow,
                 'refine.very_fast': refine.very_fast,
                 'refine.fast': refine.fast}
    level_key = md_level.strip()
    if level_key not in md_levels:
        # BUG FIX: an unknown level used to be silently ignored (leaving the
        # LoopModel default in place); fail loudly instead.
        raise ValueError(
            f"Unsupported md_level {md_level!r}; expected one of {sorted(md_levels)}")
    loop_model.loop.md_level = md_levels[level_key]

    loop_model.make()
    print(f"Time cost: {time.time() - time_start}s")


def fasta_to_ali(fasta_file, outdir):
    """
    Convert a single-record FASTA file to the PIR (.ali) format used by
    Modeller, using '<stem>_full' as the align code.

    Args:
        fasta_file: Path of the input FASTA file (one record expected).
        outdir: Output directory (created if missing).

    Returns:
        str: Path of the written '<stem>_full.ali' file.
    """
    os.makedirs(outdir, exist_ok=True)

    sequence = Path(fasta_file).stem
    with open(fasta_file, 'r') as f:
        lines = f.readlines()
    # BUG FIX: only lines[1] was read before, which truncated any FASTA whose
    # sequence wraps over multiple lines; join every non-header line instead.
    seq = ''.join(line.strip() for line in lines[1:] if not line.startswith('>'))
    ali_file = f'{outdir}/{sequence}_full.ali'
    with open(ali_file, 'w') as f:
        f.write(f'>P1;{sequence}_full\n')
        f.write(f'sequence:{sequence}_full:::::::0.00: 0.00\n')
        f.write(f'{seq}*')
    return ali_file


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Build model by Modeller")
    parser.add_argument("-s", "--structure", help="Structure file")
    parser.add_argument("-o", "--outdir", help="Output directory")
    parser.add_argument("-f", "--fasta", help="Fasta file")
    # BUG FIX: defaults added so omitting -n no longer crashes int(None).
    parser.add_argument("-n", "--num_loop", type=int, default=2,
                        help="Number of loop models")
    parser.add_argument("-m", "--md_level", default='refine.fast', help="MD level")
    parser.add_argument("-c", "--chain", help="your fix chain ID")
    args = parser.parse_args()

    sequence_file = fasta_to_ali(args.fasta, args.outdir)
    make_model(args.structure, sequence_file,
               args.outdir, args.chain, args.num_loop, args.md_level)

# python build_modellel.py -s 5sws_fixer.pdb -o ./5swsmodellerfix -f rcsb_pdb_5SWS.fasta -n 1 -m refine.fast -c A
# python build_modellel.py -s ../5sws_fixer.pdb -o ./5swsmodellerfix -f ../rcsb_pdb_5SWS.fasta -n 1 -m refine.fast -c D

# ========================= build_modeller.py (file 3/3) ========================
from dataclasses import dataclass, field
from pathlib import Path
from modeller import *
from modeller.automodel import *
import time
from typing import List


@dataclass
class PDBModeler:
    """
    Object-oriented wrapper around Modeller loop modeling.

    Attributes:
        structure_file: Template PDB file.
        fasta_file: Target sequence as a single-record FASTA file.
        outdir: Directory for the .ali/.pap intermediates (created if missing).
        chain: Chain identifier to model.
        num_loop: Number of loop models to generate.
        md_level: 'refine.fast' (default), 'refine.very_fast' or 'refine.slow'.
    """
    structure_file: Path
    fasta_file: Path
    outdir: Path
    chain: str
    num_loop: int = 2
    md_level: str = 'refine.fast'  # refine.very_fast or refine.slow optional

    def __post_init__(self):
        # Align codes are derived from the file stems; the .ali file is
        # written eagerly so make_model can assume it exists.
        self.structure = self.structure_file.stem
        self.sequence = self.fasta_file.stem
        self.ali_file = self.fasta_to_ali()

    def make_model(self):
        """
        Align the structure against the sequence and build the loop models.

        Returns:
            List[Path]: Paths of all successfully generated model files.

        Raises:
            ValueError: If md_level is not one of the supported settings.
        """
        print("***************************************************")
        print("md_level ====", self.md_level)
        print("***************************************************")

        time_start = time.time()

        # --- Align structure and sequence ---------------------------------
        env1 = Environ()
        mdl = Model(env1, file=self.structure_file.as_posix(),
                    model_segment=(f'FIRST:{self.chain}', f'LAST:{self.chain}'))
        aln = Alignment(env1)
        aln.append_model(mdl, align_codes=self.structure,
                         atom_files=self.structure_file.as_posix())
        aln.append(file=self.ali_file.as_posix(), align_codes=self.sequence)
        aln.align2d()
        aln.write(file=(self.outdir / 'alignment.ali').as_posix(), alignment_format='PIR')
        aln.write(file=(self.outdir / 'alignment.pap').as_posix(), alignment_format='PAP')

        log.verbose()

        # --- Rebuild the structure ----------------------------------------
        env2 = Environ()
        env2.io.atom_files_directory = ['.']
        loop_model = LoopModel(env2,
                               alnfile=(self.outdir / 'alignment.ali').as_posix(),
                               knowns=self.structure,
                               sequence=self.sequence,
                               loop_assess_methods=(assess.DOPE, assess.GA341))
        # Number of loop models generated: (ending - starting) + 1.
        loop_model.loop.starting_model = 1
        loop_model.loop.ending_model = int(self.num_loop)
        # Map the requested MD refinement level onto the Modeller constants.
        md_levels = {'refine.slow': refine.slow,
                     'refine.very_fast': refine.very_fast,
                     'refine.fast': refine.fast}
        level_key = self.md_level.strip()
        if level_key not in md_levels:
            # BUG FIX: an unknown level used to be silently ignored; fail loudly.
            raise ValueError(
                f"Unsupported md_level {self.md_level!r}; expected one of {sorted(md_levels)}")
        loop_model.loop.md_level = md_levels[level_key]

        loop_model.make()
        print(f"Time cost: {time.time() - time_start}s")

        # Collect the paths of all successfully generated model files.
        model_files = self.get_model_files(loop_model)
        if model_files:
            print(f"Model files: {[file.name for file in model_files]}")
        else:
            print("No model files found.")
        return model_files

    def get_model_files(self, loop_model) -> List[Path]:
        """Return paths of the models in loop_model.loop.outputs that were
        generated without a recorded failure."""
        return [Path(output.get('name'))
                for output in loop_model.loop.outputs
                if output.get('failure') is None]

    def fasta_to_ali(self) -> Path:
        """
        Convert self.fasta_file to the PIR (.ali) format used by Modeller.

        Returns:
            Path: Path of the written '<sequence>.ali' file inside outdir.
        """
        self.outdir.mkdir(parents=True, exist_ok=True)

        ali_file = self.outdir / f'{self.sequence}.ali'
        if ali_file.exists():
            ali_file.unlink()

        with open(self.fasta_file, 'r') as f:
            lines = f.readlines()
        # BUG FIX: only lines[1] was read before, truncating wrapped FASTA
        # sequences; join every non-header line instead.
        seq = ''.join(line.strip() for line in lines[1:] if not line.startswith('>'))

        with open(ali_file, 'w') as f:
            f.write(f'>P1;{self.sequence}\n')
            f.write(f'sequence:{self.sequence}:::::::0.00: 0.00\n')
            f.write(f'{seq}*')

        return ali_file


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Build model by Modeller")
    parser.add_argument("-s", "--structure", help="Structure file")
    parser.add_argument("-o", "--outdir", help="Output directory")
    parser.add_argument("-f", "--fasta", help="Fasta file")
    # BUG FIX: defaults added so omitting -n no longer crashes int(None).
    parser.add_argument("-n", "--num_loop", type=int, default=2,
                        help="Number of loop models")
    parser.add_argument("-m", "--md_level", default='refine.fast', help="MD level")
    parser.add_argument("-c", "--chain", help="Chain ID")
    args = parser.parse_args()

    modeler = PDBModeler(Path(args.structure), Path(args.fasta), Path(args.outdir),
                         args.chain, args.num_loop, args.md_level)
    modeler.make_model()

# test command
# python build_modellel.py -s ../5sws_fixer.pdb -o ./5swsmodellerfix -f ../rcsb_pdb_5SWS.fasta -n 1 -m refine.very_fast -c D