init
This commit is contained in:
450
analysis_pdb.py
Normal file
450
analysis_pdb.py
Normal file
@@ -0,0 +1,450 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
'''
|
||||
@file :analysis_pdb.py
|
||||
@Description: : 分析PDB结构
|
||||
@Date :2023/11/27 09:57:00
|
||||
@Author :lyzeng
|
||||
@Email :pylyzeng@gmail.com
|
||||
@version :1.0
|
||||
'''
|
||||
# micromamba create -n modeller modeller biopython pymol-open-source biopandas requests -y -c conda-forge -c salilab
|
||||
# modeller注册码:MODELIRANJE
|
||||
from dataclasses import dataclass, field
|
||||
from Bio.PDB import PDBParser
|
||||
from Bio.SeqUtils import seq1
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
from functools import reduce, partial
|
||||
from Bio.PDB import MMCIFIO, PDBIO, Chain, Structure
|
||||
from biopandas.pdb import PandasPdb
|
||||
from pathlib import Path
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
from Bio import SeqIO
|
||||
import requests
|
||||
from copy import deepcopy
|
||||
from pymol import cmd
|
||||
|
||||
@dataclass
class PDBAnalyzer:
    """
    Analyze a protein structure stored in a PDB file.

    Combines a Bio.PDB parse (residue-level inspection) with a biopandas
    dataframe view (chain-level slicing and editing) of the same file.

    Attributes:
        pdb_file (Path): Path to the PDB file to be analyzed.
        pid (Optional[str]): Lower-cased 4-character PDB ID taken from the
            file stem, or None when the stem is not a standard PDB ID.
        structure (object): The parsed Bio.PDB structure, set in __post_init__.
        biodata (PandasPdb): biopandas view of the same file.
        protein_state (str): 'Holo' when HETATM records are present, else 'Apo'.
        chain_id_list (List[str]): Unique chain identifiers of the ATOM records.
    """
    pdb_file: Path
    pid: Optional[str] = field(default=None, init=False)
    structure: object = field(init=False)
    # NOTE: this field was previously declared as 'biodf' while every method
    # used 'self.biodata'; the declaration now matches the actual attribute.
    biodata: PandasPdb = field(init=False)
    protein_state: str = field(init=False)  # 'Apo' or 'Holo'
    chain_id_list: List[str] = field(init=False)

    def __post_init__(self):
        """Parse the PDB file and derive the basic metadata attributes."""
        parser = PDBParser(QUIET=True)
        # Standard PDB IDs are exactly four characters long.
        self.pid = self.pdb_file.stem.lower() if len(self.pdb_file.stem) == 4 else None
        self.structure = parser.get_structure('PDB_structure', self.pdb_file.as_posix())
        self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix())
        # biopandas always creates a 'HETATM' key (possibly empty), so test
        # for actual HETATM rows instead of mere key presence.
        hetatm_df = self.biodata.df.get('HETATM')
        self.protein_state = 'Holo' if hetatm_df is not None and not hetatm_df.empty else 'Apo'
        self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()

    def check_continuity(self, chain, missing_char):
        """
        Check the continuity of residues in a protein chain.

        Args:
            chain (Chain): A Chain object from Bio.PDB.
            missing_char (str): Character to denote missing residues.

        Returns:
            List[Tuple[str, int]]: List of tuples containing residue names and
            their sequence numbers, with numbering gaps filled by missing_char.
        """
        def reducer(acc, residue):
            # Fill any numbering gap before this residue with missing_char.
            expected_resseq = acc[-1][1] + 1 if acc else residue.id[1]
            while expected_resseq < residue.id[1]:
                acc.append((missing_char, expected_resseq))
                expected_resseq += 1
            acc.append((residue.resname, residue.id[1]))
            return acc
        return reduce(reducer, chain, [])

    def residue_to_single_letter(self, residue_name):
        """
        Convert a three-letter residue name to a single-letter code.

        Args:
            residue_name (str): Three-letter residue name.

        Returns:
            str: Single-letter residue code, or the original string if it is a
            placeholder character ('-', 'X', '').
        """
        # Placeholder characters pass through unchanged.
        if residue_name in {'-', 'X', ''}:
            return residue_name
        return seq1(residue_name)

    def extract_sequences(self, missing_char='-') -> Dict[str, str]:
        """
        Extract amino acid sequences from the structure, with gaps denoted by
        the specified character.

        Args:
            missing_char (str): Character to use for missing residues
                (default is '-').

        Returns:
            Dict[str, str]: Dictionary of chain IDs mapped to their amino acid
            sequences.
        """
        # Bind the missing character into the continuity checker once.
        check_continuity_new = partial(self.check_continuity, missing_char=missing_char)

        sequences = {}
        for model in self.structure:
            for chain in model.get_list():
                # Residue list with gaps filled, then collapsed to one-letter codes.
                chain_sequence = check_continuity_new(chain)
                single_letter_sequence = ''.join(
                    self.residue_to_single_letter(res[0]) for res in chain_sequence)
                sequences[chain.id] = single_letter_sequence

        return sequences

    def extract_sequences_info(self) -> Dict[str, List[int]]:
        """
        Get missing residues information for each chain in the structure.

        Returns:
            Dict[str, List[int]]: Dictionary mapping each chain ID to a sorted
            list of missing residue numbers. Chains without gaps are omitted.
        """
        def find_missing(chain):
            # Hetero/water records have a non-blank hetero flag; skip them.
            observed = [residue.get_id()[1] for residue in chain if residue.id[0] == ' ']
            if observed:
                full_range = range(min(observed), max(observed) + 1)
                return chain.get_id(), sorted(set(full_range) - set(observed))
            return chain.get_id(), []

        chains = [chain for model in self.structure for chain in model]
        missing_info = map(find_missing, chains)
        # Drop chains with an empty missing-residue list.
        return dict(filter(lambda x: x[1], missing_info))

    def extract_chain(self, chain_id: str):
        """
        Extract a specific chain from the structure using biopython.

        Args:
            chain_id (str): ID of the chain to be extracted.

        Returns:
            object: An object exposing a save(filename, file_format) method for
            the extracted chain.

        Raises:
            ValueError: If the chain is not present in the structure.
        """
        target_chain = next(
            (chain for model in self.structure for chain in model if chain.id == chain_id),
            None)
        if target_chain is None:
            raise ValueError(f"Chain {chain_id} not found in the structure.")

        def save_to_file(filename: str, file_format: str = 'pdb'):
            """
            Save the extracted chain to a file in the specified format.

            Args:
                filename (str): Name of the file to save.
                file_format (str): Format of the file ('pdb' or 'cif').
                    Default is 'pdb'.
            """
            if file_format not in ['pdb', 'cif']:
                raise ValueError("Unsupported file format. Use 'pdb' or 'cif'.")

            if file_format == 'pdb':
                io = PDBIO()
            elif file_format == 'cif':
                io = MMCIFIO()

            io.set_structure(target_chain)
            io.save(filename)

        return type('ChainExtractor', (object,), {'save': save_to_file})

    def get_missing_residues_as_loops(self) -> Dict[str, List[Tuple[int, int, int]]]:
        """
        Get missing residues information formatted as loops for PyRosetta,
        along with chain identifiers.

        Returns:
            Dict[str, List[Tuple[int, int, int]]]: Dictionary mapping chain IDs
            to lists of tuples representing loop regions (start, end, cut).
        """
        missing_residues = self.extract_sequences_info()
        loops_by_chain = {}
        for chain_id, missing_res_nums in missing_residues.items():
            # Each missing residue becomes a minimal (prev, next, cut) loop.
            loops = [(num - 1, num + 1, num) for num in missing_res_nums]
            loops_by_chain[chain_id] = loops
        return loops_by_chain

    def change_chain_identifier(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PandasPdb:
        """
        Change the identifier of a specific chain (biopandas implementation).

        Args:
            old_chain_id (str): Original identifier of the chain.
            new_chain_id (str): New identifier to be assigned to the chain.
            split (bool): When True, operate on a copy restricted to the
                old chain only.

        Returns:
            PandasPdb: A new object (original self.biodata is untouched) with
            the renamed chain; save it with to_pdb().
        """
        # Work on a copy so the analyzer's own dataframe stays pristine.
        biodata = self.split_chain(old_chain_id) if split else deepcopy(self.biodata)
        biodata.df['ATOM']['chain_id'] = biodata.df['ATOM']['chain_id'].replace(
            old_chain_id.strip().upper(), new_chain_id.strip().upper())
        return biodata

    def change_chain_identifier_biopython(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PDBIO:
        """
        Change a chain identifier using biopython; returns a PDBIO ready to save.

        Note: when split is False this mutates self.structure in place.
        """
        io = PDBIO()

        def change_id(chain):
            if chain.id == old_chain_id:
                chain.id = new_chain_id
            return chain

        if split:
            # Build a new structure containing only the renamed target chain.
            new_structure = Structure.Structure("modified_structure")
            for model in self.structure:
                filtered_chains = filter(lambda c: c.id == old_chain_id, model)
                changed_chains = map(change_id, filtered_chains)
                new_model = reduce(lambda m, c: m.add(c) or m, changed_chains,
                                   Structure.Model.Model(model.id))
                new_structure.add(new_model)
            io.set_structure(new_structure)
        else:
            # Rename the chain across the whole (shared) structure.
            all_chains = (chain for model in self.structure for chain in model)
            list(map(change_id, all_chains))
            io.set_structure(self.structure)

        return io

    def split_chain(self, chain_id: str) -> PandasPdb:
        """
        Return a new PandasPdb restricted to ATOM records of one chain.

        Note: HETATM records are dropped. Save the result with to_pdb().
        """
        biodata = deepcopy(self.biodata)
        df = biodata.df['ATOM']
        tmp = PandasPdb()  # empty container for the selected chain
        tmp.df['ATOM'] = df[df['chain_id'] == chain_id.strip().upper()]
        return tmp

    def save_biodata(self, out_file: Path):
        """Write the current biopandas dataframe back to a PDB file."""
        self.biodata.to_pdb(out_file.as_posix())

    @staticmethod
    def write_seq_to_fasta_single_line(seq_record: SeqRecord, output_file: Path):
        """Write one SeqRecord as a two-line FASTA (header + unwrapped sequence)."""
        with open(output_file, 'w') as output_handle:
            output_handle.write(f">{seq_record.description}\n")
            output_handle.write(str(seq_record.seq) + "\n")

    def download_fasta(self, pdb_id: Optional[str] = None):
        """
        Download the FASTA file for a PDB ID from RCSB next to the PDB file.

        Args:
            pdb_id (Optional[str]): PDB ID to fetch; defaults to self.pid.

        Returns:
            Path: Path of the downloaded FASTA file.

        Raises:
            ValueError: When no PDB ID is available.
            Exception: When the download fails.
        """
        if pdb_id is None:
            pdb_id = self.pid
        if pdb_id is None:
            raise ValueError("PDB ID not found.")
        url = f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}"
        # Timeout prevents hanging forever on a stalled connection.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            output_file = self.pdb_file.parent / f"{pdb_id}.fasta"
            with open(output_file, 'w') as file:
                file.write(response.text)
            return output_file
        else:
            raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")

    def filter_sequences(self, chain_id: str) -> List[SeqRecord]:
        """Return the FASTA records whose description mentions this chain."""
        return list(filter(lambda x: f"Chain {chain_id}" in x.description, self.read_fasta()))

    def read_fasta(self) -> SeqRecord:
        """Parse the local FASTA file for this entry, downloading it if absent."""
        fasta_file = self.pdb_file.parent / f"{self.pid}.fasta"
        if not fasta_file.exists():
            self.download_fasta()
        return SeqIO.parse(fasta_file.as_posix(), "fasta")
|
||||
|
||||
@dataclass
class PDBAlign:
    """Structurally align a target PDB onto a template using PyMOL."""
    template_file: Path
    target_file: Path
    out_file: Path = field(default=None)  # optional output path (not written by align itself)

    def align(self):
        """Align target onto template; return the aligned target as a PDB string."""
        template_name, target_name = "template", "target"
        cmd.reinitialize()
        # Load the template first, then the structure to be aligned.
        for path, name in ((self.template_file, template_name),
                           (self.target_file, target_name)):
            cmd.load(path.as_posix(), name)
        cmd.align(target_name, template_name)
        return cmd.get_pdbstr(target_name)
|
||||
|
||||
def main(PDB_ID, PDB_file_path):
    """Analyze one PDB file, split its chains, rebuild missing loop regions
    with Modeller, and merge the fixed chains into a single PDB file.

    Args:
        PDB_ID (str): PDB identifier used to name output files.
        PDB_file_path (Path): Path to the input PDB file.

    Example:
        PDB_ID = '5sws'
        PDB_file_path = Path(f'{PDB_ID}.pdb')
    """
    analyzer = PDBAnalyzer(PDB_file_path)
    sequences = analyzer.extract_sequences(missing_char='-')  # or 'X', or ''
    print(f'Residues info for {PDB_ID}: \n',sequences)
    missing_info = analyzer.extract_sequences_info()
    print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
    # Example: extract chain A with biopython (HETATM records are kept)
    chain_extractor = analyzer.extract_chain('A')
    chain_extractor.save('biopython_extracted_chain_A.pdb')
    # Example: extract chain A with biopandas (HETATM records are dropped)
    chain_extractor = analyzer.split_chain('A')
    chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb')
    # Rename chain A to B and save it as a standalone file
    analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
    # Split every chain into its own file
    split_dict = {}
    for chain_id in analyzer.chain_id_list:
        fn = Path(f'{PDB_ID}_{chain_id}.pdb')
        analyzer.split_chain(chain_id).to_pdb(fn.as_posix())
        split_dict[chain_id] = fn.read_text()
    # Rebuild loop (missing-residue) regions with Modeller
    from build_modeller import PDBModeler
    from modeller import ModellerError
    mc_dict = {}
    for mc in missing_info:
        # BUG FIX: output name was hard-coded to '5sws_...' instead of PDB_ID.
        out_file = f'{PDB_ID}_{mc}.pdb'
        analyzer.split_chain(mc).to_pdb(out_file)        # chain with missing residues
        mc_fasta = analyzer.filter_sequences(mc)          # matching FASTA records
        if len(mc_fasta) == 1:
            mc_fasta = mc_fasta[0]
            out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
            analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
            print(f'>{mc_fasta.description}')
            print(mc_fasta.seq)
            modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
            try:
                modeller_results = modeller.make_model()
            except ModellerError:
                print(f'Failed to build model for chain {mc}')
                print(f'No loops detected in {out_fasta_file.name}')
                print(f'may pdb file sequence is not correct')
                continue
            print(f'Model files: {[file.name for file in modeller_results]}')
            # Restore the original chain identifier on every generated model
            for model_file in modeller_results:
                manalyzer = PDBAnalyzer(model_file)
                manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(model_file)
            if len(modeller_results) == 1:
                # Align the rebuilt chain back onto the original structure
                aligner = PDBAlign(PDB_file_path, modeller_results[0], Path(f'{PDB_ID}_merge_model.pdb'))
                mc_dict[mc] = aligner.align()
            else:
                print('more than one model file, please set num_loop to 1')
        elif len(mc_fasta) == 0:
            # No matching sequence record for this chain; nothing to fix.
            continue
        else:
            raise ValueError(f'only can fix one chain content: {mc_fasta}')

    # Overwrite raw chains with their fixed versions, then merge everything
    split_dict.update(mc_dict)
    import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')
|
||||
|
||||
def import_and_merge_pdb_strings(pdb_strings, merged_object_name, output_file):
    """
    Load multiple objects from PDB strings into PyMOL, merge them into a
    single object, and save the result as one PDB file.

    :param pdb_strings: Dict mapping object/chain names to PDB data strings.
    :param merged_object_name: Name of the merged PyMOL object.
    :param output_file: Path of the output PDB file.
    """
    # Reset PyMOL state (required when running in a script / non-interactively)
    cmd.reinitialize()

    # Import each chain under a unique object name, collecting the names once
    # (previously the list was rebuilt with a second pass over the keys).
    object_names = []
    for chain_id, pdb_str in pdb_strings.items():
        object_name = f"chain_{chain_id}"
        cmd.read_pdbstr(pdb_str, object_name)
        object_names.append(object_name)

    # Merge all imported objects into one
    cmd.create(merged_object_name, ' or '.join(object_names))

    # Save the merged object
    cmd.save(output_file, merged_object_name)
|
||||
|
||||
if __name__ == "__main__":
|
||||
# import argparse
|
||||
# parser = argparse.ArgumentParser(description="Build model by Modeller")
|
||||
# parser.add_argument("-s", "--structure", help="Structure file")
|
||||
# parser.add_argument("-o", "--outdir", help="Output directory")
|
||||
# parser.add_argument("-f", "--fasta", help="Fasta file")
|
||||
# parser.add_argument("-n", "--num_loop", help="Number of loop model")
|
||||
# parser.add_argument("-m", "--md_level", help="MD level")
|
||||
# parser.add_argument("-c", "--chain", help="Chain ID")
|
||||
# args = parser.parse_args()
|
||||
pdbfiles = [i for i in Path('../PDBfile').glob('*.pdb')]
|
||||
for i in pdbfiles:
|
||||
PDB_file_path = i
|
||||
PDB_ID = i.stem
|
||||
analyzer = PDBAnalyzer(PDB_file_path)
|
||||
sequences = analyzer.extract_sequences(missing_char='-') # 或者 'X', 或者 ''
|
||||
print(f'Residues info for {PDB_ID}: \n',sequences)
|
||||
missing_info = analyzer.extract_sequences_info()
|
||||
print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
|
||||
# 示例: 使用biopython提取A链(将会保留HETATM)
|
||||
chain_extractor = analyzer.extract_chain('A') # 假设要提取的链ID是 'A'
|
||||
chain_extractor.save('biopython_extracted_chain_A.pdb') # 保存为PDB文件
|
||||
# 示例: 使用biopandas提取A链(将不会保留HETATM)
|
||||
chain_extractor = analyzer.split_chain('A') # 假设要提取的链ID是 'A'
|
||||
chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb') # 保存为PDB文件
|
||||
# A链改B链, 并分割保存为单独文件
|
||||
analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
|
||||
# 分割所有的链
|
||||
split_dict = {}
|
||||
for j in analyzer.chain_id_list:
|
||||
fn = Path(f'{PDB_ID}_{j}.pdb')
|
||||
analyzer.split_chain(j).to_pdb(fn.as_posix())
|
||||
split_dict[j]=fn.read_text()
|
||||
# 修复loop区域
|
||||
from build_modeller import PDBModeler
|
||||
from modeller import ModellerError
|
||||
mc_dict = {}
|
||||
for mc in missing_info:
|
||||
out_file = f'5sws_{mc}.pdb'
|
||||
analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
|
||||
mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
|
||||
if len(mc_fasta) == 1:
|
||||
mc_fasta = mc_fasta[0]
|
||||
out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
|
||||
analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
|
||||
print(f'>{mc_fasta.description}')
|
||||
print(mc_fasta.seq)
|
||||
modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
|
||||
try:
|
||||
modeller_results = modeller.make_model()
|
||||
except ModellerError:
|
||||
print(f'Failed to build model for chain {mc}')
|
||||
print(f'No loops detected in {out_fasta_file.name}')
|
||||
print(f'may pdb file sequence is not correct')
|
||||
continue
|
||||
except Exception as e:
|
||||
raise e
|
||||
print(f'Model files: {[file.name for file in modeller_results]}')
|
||||
# change id to original
|
||||
for i in modeller_results:
|
||||
manalyzer = PDBAnalyzer(i)
|
||||
manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i)
|
||||
if len(modeller_results) == 1:
|
||||
# use pymol to align
|
||||
aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb'))
|
||||
pdbstr = aligner.align()
|
||||
mc_dict[mc] = pdbstr
|
||||
else:
|
||||
print('more than one model file, please set num_loop to 1')
|
||||
elif len(mc_fasta) == 0:
|
||||
continue
|
||||
else:
|
||||
raise ValueError(f'only can fix one chain content: {mc_fasta}')
|
||||
|
||||
# 使用示例
|
||||
split_dict.update(mc_dict) # 更新 split_dict
|
||||
import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')
|
||||
|
||||
111
build_modellel.py
Normal file
111
build_modellel.py
Normal file
@@ -0,0 +1,111 @@
|
||||
from pathlib import Path
|
||||
from modeller import *
|
||||
from modeller.automodel import * # 加载 AutoModel 类
|
||||
import time
|
||||
import os
|
||||
# micromamba create -n modeller modeller biopython pymol-open-source biopandas -y -c conda-forge -c salilab
|
||||
# 注册码:MODELIRANJE
|
||||
# python /opt/software/docking_pipeline/scripts/protein_structure/modeller/build_modellel.py -s /mnt/AppData/task-27/ProteinCompletion/protein.pdb -f /mnt/AppData/task-27/ProteinCompletion/seq.fasta -o ./ -n 1 -m refine.fast
|
||||
|
||||
def make_model(structure_file,
               sequence_file,
               outdir: str,
               chain: str,
               num_loop: int = 2,
               md_level: str = 'refine.fast'
               ):
    """Rebuild missing loop regions of one chain with Modeller's LoopModel.

    Args:
        structure_file: Path to the template PDB file.
        sequence_file: Path to the full-sequence alignment file (PIR/.ali).
        outdir (str): Directory receiving alignment and model files.
        chain (str): Chain identifier to model.
        num_loop (int): Number of loop models to build (default 2).
        md_level (str): MD refinement level: 'refine.slow', 'refine.fast'
            or 'refine.very_fast'.
    """
    print("***************************************************")
    print("md_level ====",md_level)
    print("***************************************************")

    structure = Path(structure_file).stem   # align code of the template
    sequence = Path(sequence_file).stem     # align code of the target sequence
    time_start = time.time()

    # --- Align structure and sequence -------------------------------------
    env1 = Environ()
    # Read only the requested chain from the PDB file
    mdl = Model(env1, file=structure_file, model_segment=(f'FIRST:{chain}', f'LAST:{chain}'))
    aln = Alignment(env1)
    aln.append_model(mdl, align_codes=structure, atom_files=structure_file)
    # Add the target sequence and run the structure-aware 2D alignment
    aln.append(file=sequence_file, align_codes=sequence)
    aln.align2d()
    # Persist the alignment in both PIR and PAP formats
    aln.write(file=f'{outdir}/alignment.ali', alignment_format='PIR')
    aln.write(file=f'{outdir}/alignment.pap', alignment_format='PAP')

    log.verbose()

    # --- Build the loop models --------------------------------------------
    env2 = Environ()
    env2.io.atom_files_directory = ['.']    # where Modeller finds atom files
    loop_model = LoopModel(env2,
                           alnfile=f'{outdir}/alignment.ali',
                           knowns=structure,
                           sequence=sequence,
                           loop_assess_methods=(assess.DOPE, assess.GA341))
    # Number of loop models generated is (ending - starting) + 1
    loop_model.loop.starting_model = 1
    loop_model.loop.ending_model = int(num_loop)
    # Select the MD refinement schedule
    if md_level.strip() == 'refine.slow':
        loop_model.loop.md_level = refine.slow
    elif md_level.strip() == 'refine.very_fast':
        loop_model.loop.md_level = refine.very_fast
    elif md_level.strip() == 'refine.fast':
        loop_model.loop.md_level = refine.fast

    loop_model.make()
    end_time = time.time()
    print(f"Time cost: {end_time - time_start}s")
|
||||
|
||||
|
||||
def fasta_to_ali(fasta_file, outdir):
    """Convert a single-record FASTA file to a Modeller PIR/.ali file.

    BUG FIX: the previous version kept only the first sequence line
    (``seq[1]``), silently truncating multi-line FASTA records; all lines
    after the header are now joined.

    Args:
        fasta_file: Path to the input FASTA file.
        outdir: Output directory (created if missing).

    Returns:
        str: Path of the written .ali file.
    """
    os.makedirs(outdir, exist_ok=True)

    sequence = Path(fasta_file).stem
    with open(fasta_file, 'r') as f:
        lines = f.readlines()
    # Everything after the '>' header line belongs to the sequence.
    seq = ''.join(line.strip() for line in lines[1:])

    ali_file = f'{outdir}/{sequence}_full.ali'
    with open(ali_file, 'w') as f:
        f.write(f'>P1;{sequence}_full\n')
        f.write(f'sequence:{sequence}_full:::::::0.00: 0.00\n')
        f.write(f'{seq}*')
    return ali_file
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="Build model by Modeller")
|
||||
parser.add_argument("-s", "--structure", help="Structure file")
|
||||
parser.add_argument("-o", "--outdir", help="Output directory")
|
||||
parser.add_argument("-f", "--fasta", help="Fasta file")
|
||||
parser.add_argument("-n", "--num_loop", help="Number of loop model")
|
||||
parser.add_argument("-m", "--md_level", help="MD level")
|
||||
parser.add_argument("-c", "--chain", help="your fix chain ID")
|
||||
args = parser.parse_args()
|
||||
|
||||
sequence_file = fasta_to_ali(args.fasta, args.outdir)
|
||||
|
||||
make_model(args.structure, sequence_file,
|
||||
args.outdir, args.chain,args.num_loop, args.md_level)
|
||||
|
||||
# python build_modellel.py -s 5sws_fixer.pdb -o ./5swsmodellerfix -f rcsb_pdb_5SWS.fasta -n 1 -m refine.fast -c A
|
||||
# python build_modellel.py -s ../5sws_fixer.pdb -o ./5swsmodellerfix -f ../rcsb_pdb_5SWS.fasta -n 1 -m refine.fast -c D
|
||||
115
build_modeller.py
Normal file
115
build_modeller.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from modeller import *
|
||||
from modeller.automodel import *
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
@dataclass
class PDBModeler:
    """Rebuild missing loop regions of one chain with Modeller's LoopModel.

    Attributes:
        structure_file (Path): Template PDB file.
        fasta_file (Path): FASTA file holding the chain's full sequence.
        outdir (Path): Output directory (created if missing).
        chain (str): Chain identifier to model.
        num_loop (int): Number of loop models to build.
        md_level (str): 'refine.fast' (default), 'refine.very_fast' or
            'refine.slow'.
    """
    structure_file: Path
    fasta_file: Path
    outdir: Path
    chain: str
    num_loop: int = 2
    md_level: str = 'refine.fast'  # refine.very_fast or refine.slow optional

    def __post_init__(self):
        # Align codes are derived from the file stems; the FASTA is converted
        # to Modeller's PIR (.ali) format up front.
        self.structure = self.structure_file.stem
        self.sequence = self.fasta_file.stem
        self.ali_file = self.fasta_to_ali()

    def make_model(self):
        """Run the alignment + loop-modeling pipeline.

        Returns:
            List[Path]: Paths of the successfully generated model files.
        """
        print("***************************************************")
        print("md_level ====", self.md_level)
        print("***************************************************")

        time_start = time.time()

        # --- Align the template structure against the target sequence ------
        env1 = Environ()
        # Read only the requested chain from the PDB file
        mdl = Model(env1, file=self.structure_file.as_posix(), model_segment=(f'FIRST:{self.chain}', f'LAST:{self.chain}'))
        aln = Alignment(env1)
        aln.append_model(mdl, align_codes=self.structure, atom_files=self.structure_file.as_posix())
        aln.append(file=self.ali_file.as_posix(), align_codes=self.sequence)
        aln.align2d()
        aln.write(file=(self.outdir / 'alignment.ali').as_posix(), alignment_format='PIR')
        aln.write(file=(self.outdir / 'alignment.pap').as_posix(), alignment_format='PAP')

        log.verbose()

        # --- Build the loop models ------------------------------------------
        env2 = Environ()
        env2.io.atom_files_directory = ['.']
        loop_model = LoopModel(env2,
                               alnfile=(self.outdir / 'alignment.ali').as_posix(),
                               knowns=self.structure,
                               sequence=self.sequence,
                               loop_assess_methods=(assess.DOPE, assess.GA341))
        # Number of loop models generated is (ending - starting) + 1
        loop_model.loop.starting_model = 1
        loop_model.loop.ending_model = self.num_loop
        # Select the MD refinement schedule
        if self.md_level.strip() == 'refine.slow':
            loop_model.loop.md_level = refine.slow
        elif self.md_level.strip() == 'refine.very_fast':
            loop_model.loop.md_level = refine.very_fast
        elif self.md_level.strip() == 'refine.fast':
            loop_model.loop.md_level = refine.fast

        loop_model.make()
        end_time = time.time()
        print(f"Time cost: {end_time - time_start}s")

        # Collect the model files that were actually produced
        model_files = self.get_model_files(loop_model)
        if model_files:
            print(f"Model files: {[file.name for file in model_files]}")
        else:
            print("No model files found.")

        return model_files

    def get_model_files(self, loop_model) -> List[Path]:
        """Return paths of loop models whose generation did not fail."""
        return [Path(output.get('name'))
                for output in loop_model.loop.outputs
                if output.get('failure') is None]

    def fasta_to_ali(self) -> Path:
        """Convert the FASTA file to Modeller PIR (.ali) format.

        BUG FIX: the previous version kept only the second file line,
        silently truncating multi-line FASTA records; all lines after the
        header are now joined.

        Returns:
            Path: Path of the written .ali file.
        """
        if not self.outdir.exists():
            self.outdir.mkdir(parents=True, exist_ok=True)

        ali_file = self.outdir / f'{self.sequence}.ali'
        if ali_file.exists():
            ali_file.unlink()

        with open(self.fasta_file, 'r') as f:
            # Everything after the '>' header line belongs to the sequence.
            seq = ''.join(line.strip() for line in f.readlines()[1:])

        with open(ali_file, 'w') as f:
            f.write(f'>P1;{self.sequence}\n')
            f.write(f'sequence:{self.sequence}:::::::0.00: 0.00\n')
            f.write(f'{seq}*')

        return ali_file
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="Build model by Modeller")
|
||||
parser.add_argument("-s", "--structure", help="Structure file")
|
||||
parser.add_argument("-o", "--outdir", help="Output directory")
|
||||
parser.add_argument("-f", "--fasta", help="Fasta file")
|
||||
parser.add_argument("-n", "--num_loop", help="Number of loop model")
|
||||
parser.add_argument("-m", "--md_level", help="MD level")
|
||||
parser.add_argument("-c", "--chain", help="Chain ID")
|
||||
args = parser.parse_args()
|
||||
|
||||
modeler = PDBModeler(Path(args.structure), Path(args.fasta), Path(args.outdir),
|
||||
args.chain, int(args.num_loop), args.md_level)
|
||||
modeler.make_model()
|
||||
|
||||
# test command
|
||||
# python build_modellel.py -s ../5sws_fixer.pdb -o ./5swsmodellerfix -f ../rcsb_pdb_5SWS.fasta -n 1 -m refine.very_fast -c D
|
||||
Reference in New Issue
Block a user