This commit is contained in:
2023-12-04 09:00:16 +08:00
commit f71d0d81a2
3 changed files with 676 additions and 0 deletions

450
analysis_pdb.py Normal file
View File

@@ -0,0 +1,450 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :analysis_pdb.py
@Description :Analyze PDB structures
@Date :2023/11/27 09:57:00
@Author :lyzeng
@Email :pylyzeng@gmail.com
@version :1.0
'''
# micromamba create -n modeller modeller biopython pymol-open-source biopandas requests -y -c conda-forge -c salilab
# Modeller license key: MODELIRANJE
from dataclasses import dataclass, field
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1
from typing import List, Dict, Tuple, Optional
from functools import reduce, partial
from Bio.PDB import MMCIFIO, PDBIO, Chain, Structure
from biopandas.pdb import PandasPdb
from pathlib import Path
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import requests
from copy import deepcopy
from pymol import cmd
@dataclass
class PDBAnalyzer:
    """
    Analyze a protein structure loaded from a PDB file.

    Attributes:
        pdb_file (Path): Path to the PDB file to be analyzed.
        pid (Optional[str]): Lower-case 4-character PDB ID taken from the file
            stem, or None when the stem is not exactly 4 characters.
        structure (object): Parsed Bio.PDB structure, set in __post_init__.
        biodata (PandasPdb): Biopandas view of the same file, set in __post_init__.
        protein_state (str): 'Holo' when HETATM records are present, else 'Apo'.
        chain_id_list (List[str]): Unique chain ids found in the ATOM records.
    """
    pdb_file: Path
    pid: Optional[str] = field(default=None, init=False)
    structure: object = field(init=False)
    # Renamed from the original `biodf` declaration: __post_init__ and every
    # method only ever assign/read `self.biodata`, so `biodf` was never set.
    biodata: PandasPdb = field(init=False)
    protein_state: str = field(init=False)  # 'Apo' or 'Holo'
    chain_id_list: List[str] = field(init=False)

    def __post_init__(self):
        """
        Parse the PDB file and derive basic metadata after object creation.
        """
        parser = PDBParser(QUIET=True)
        self.pid = self.pdb_file.stem.lower() if len(self.pdb_file.stem) == 4 else None
        self.structure = parser.get_structure('PDB_structure', self.pdb_file.as_posix())
        self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix())
        # PandasPdb always creates a 'HETATM' key (possibly holding an empty
        # dataframe), so the original key-membership test classified every
        # structure as 'Holo'; inspect the dataframe content instead.
        hetatm_df = self.biodata.df.get('HETATM')
        self.protein_state = 'Holo' if hetatm_df is not None and not hetatm_df.empty else 'Apo'
        self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()

    def check_continuity(self, chain, missing_char):
        """
        Check the continuity of residues in a protein chain.

        Args:
            chain (Chain): A Chain object from Bio.PDB.
            missing_char (str): Character to denote missing residues.

        Returns:
            List[Tuple[str, int]]: Residue names with their sequence numbers;
            numbering gaps are filled with (missing_char, seq_number) entries.
        """
        def reducer(acc, residue):
            # Accumulate residues, filling in gaps with the missing character.
            expected_resseq = acc[-1][1] + 1 if acc else residue.id[1]
            while expected_resseq < residue.id[1]:
                acc.append((missing_char, expected_resseq))
                expected_resseq += 1
            acc.append((residue.resname, residue.id[1]))
            return acc
        return reduce(reducer, chain, [])

    def residue_to_single_letter(self, residue_name):
        """
        Convert a three-letter residue name to a single-letter code.

        Args:
            residue_name (str): Three-letter residue name.

        Returns:
            str: Single-letter residue code, or the original string when it is
            one of the gap/placeholder characters.
        """
        # Pass gap placeholders through unchanged.
        if residue_name in {'-', 'X', ''}:
            return residue_name
        # Convert standard residue names to single-letter codes.
        return seq1(residue_name)

    def extract_sequences(self, missing_char='-') -> Dict[str, str]:
        """
        Extract amino acid sequences from the structure, with gaps denoted by
        the specified character.

        Args:
            missing_char (str): Character to use for missing residues (default '-').

        Returns:
            Dict[str, str]: Chain IDs mapped to their amino acid sequences.
        """
        # Bind the gap character once for reuse on every chain.
        check_continuity_new = partial(self.check_continuity, missing_char=missing_char)
        sequences = {}
        for model in self.structure:
            chains = model.get_list()
            for chain in chains:
                # Gap-filled (resname, resseq) list for this chain.
                chain_sequence = check_continuity_new(chain)
                # Collapse to a single-letter sequence string.
                single_letter_sequence = ''.join(self.residue_to_single_letter(res[0]) for res in chain_sequence)
                sequences[chain.id] = single_letter_sequence
        return sequences

    def extract_sequences_info(self) -> Dict[str, List[int]]:
        """
        Get missing-residue information for each chain in the structure.

        Returns:
            Dict[str, List[int]]: Chain IDs mapped to sorted lists of missing
            residue numbers. Chains without gaps are omitted (filtered out).
        """
        def find_missing(chain):
            # Only standard residues (hetero flag ' ') count as observed.
            observed = [residue.get_id()[1] for residue in chain if residue.id[0] == ' ']
            if observed:
                full_range = range(min(observed), max(observed) + 1)
                return chain.get_id(), sorted(set(full_range) - set(observed))
            return chain.get_id(), []
        chains = [chain for model in self.structure for chain in model]
        missing_info = map(find_missing, chains)
        return dict(filter(lambda x: x[1], missing_info))

    def extract_chain(self, chain_id: str):
        """
        Extract a specific chain from the structure, using biopython.

        Args:
            chain_id (str): ID of the chain to be extracted.

        Returns:
            object: A class object exposing a plain-function `save` attribute,
            so `result.save(filename)` writes the chain to disk.

        Raises:
            ValueError: If the chain is not present in the structure.
        """
        target_chain = next((chain for model in self.structure for chain in model if chain.id == chain_id), None)
        if target_chain is None:
            raise ValueError(f"Chain {chain_id} not found in the structure.")

        def save_to_file(filename: str, file_format: str = 'pdb'):
            """
            Save the extracted chain to a file in the specified format.

            Args:
                filename (str): Name of the file to save.
                file_format (str): 'pdb' or 'cif'. Default is 'pdb'.
            """
            if file_format not in ['pdb', 'cif']:
                raise ValueError("Unsupported file format. Use 'pdb' or 'cif'.")
            if file_format == 'pdb':
                io = PDBIO()
            elif file_format == 'cif':
                io = MMCIFIO()
            io.set_structure(target_chain)
            io.save(filename)
        # Returns the class itself (not an instance): `save` lives in the class
        # dict as a plain function, so attribute access does not bind `self`.
        return type('ChainExtractor', (object,), {'save': save_to_file})

    def get_missing_residues_as_loops(self) -> Dict[str, List[Tuple[int, int, int]]]:
        """
        Get missing residues formatted as loop regions for PyRosetta.

        Returns:
            Dict[str, List[Tuple[int, int, int]]]: Chain IDs mapped to lists of
            (start, end, cut) tuples, one per missing residue.
        """
        missing_residues = self.extract_sequences_info()
        loops_by_chain = {}
        for chain_id, missing_res_nums in missing_residues.items():
            loops = []
            for num in missing_res_nums:
                # Naive loop definition: one residue of context on each side.
                loops.append((num - 1, num + 1, num))
            loops_by_chain[chain_id] = loops
        return loops_by_chain

    def change_chain_identifier(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PandasPdb:
        """
        Change the identifier of a specific chain (biopandas version).

        Args:
            old_chain_id (str): Original identifier of the chain.
            new_chain_id (str): New identifier to be assigned to the chain.
            split (bool): When True, operate on a copy containing only the
                target chain; otherwise on a deep copy of the whole structure.

        Returns:
            PandasPdb: A new object (the original self.biodata is untouched);
            call its to_pdb method to save.
        """
        biodata = self.split_chain(old_chain_id) if split else deepcopy(self.biodata)  # ATOM records only
        biodata.df['ATOM']['chain_id'] = biodata.df['ATOM']['chain_id'].replace(old_chain_id.strip().upper(), new_chain_id.strip().upper())
        return biodata  # use to_pdb to save

    def change_chain_identifier_biopython(self, old_chain_id: str, new_chain_id: str, split: bool = False) -> PDBIO:
        """
        Change a chain identifier using biopython; returns a PDBIO ready to save.

        NOTE(review): in the split branch, `Structure.Model.Model` assumes the
        Bio.PDB.Structure module re-exports Model — verify before relying on it.
        """
        io = PDBIO()

        def change_id(chain):
            if chain.id == old_chain_id:
                chain.id = new_chain_id
            return chain

        if split:
            # Build a new structure containing only the renamed target chain.
            new_structure = Structure.Structure("modified_structure")
            for model in self.structure:
                filtered_chains = filter(lambda c: c.id == old_chain_id, model)
                changed_chains = map(change_id, filtered_chains)
                new_model = reduce(lambda m, c: m.add(c) or m, changed_chains, Structure.Model.Model(model.id))
                new_structure.add(new_model)
            io.set_structure(new_structure)
        else:
            # Rename the chain in place across the whole structure.
            all_chains = (chain for model in self.structure for chain in model)
            list(map(change_id, all_chains))
            io.set_structure(self.structure)
        return io

    def split_chain(self, chain_id: str) -> PandasPdb:
        """
        Return a new PandasPdb containing only the ATOM records of one chain.
        HETATM records are dropped by construction.
        """
        biodata = deepcopy(self.biodata)
        df = biodata.df['ATOM']  # ATOM records only
        tmp = PandasPdb()  # empty pdb file object
        tmp.df['ATOM'] = df[df['chain_id'] == chain_id.strip().upper()]
        return tmp  # use to_pdb method to save

    def save_biodata(self, out_file: Path):
        """Write the current biopandas view of the structure to a PDB file."""
        self.biodata.to_pdb(out_file.as_posix())

    @staticmethod
    def write_seq_to_fasta_single_line(seq_record: SeqRecord, output_file: Path):
        """Write one SeqRecord as a FASTA file with the sequence on one line."""
        with open(output_file, 'w') as output_handle:
            output_handle.write(f">{seq_record.description}\n")
            output_handle.write(str(seq_record.seq) + "\n")

    def download_fasta(self, pdb_id: str = None):
        """
        Download the FASTA file for a PDB entry from RCSB.

        Args:
            pdb_id (str): PDB identifier; defaults to self.pid.

        Returns:
            Path: Path of the downloaded FASTA file (next to the PDB file).

        Raises:
            ValueError: If no PDB ID is available.
            Exception: If the HTTP download fails.
        """
        if pdb_id is None:
            pdb_id = self.pid
        if pdb_id is None:
            raise ValueError("PDB ID not found.")
        url = f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}"
        response = requests.get(url)
        if response.status_code == 200:
            output_file = self.pdb_file.parent / f"{pdb_id}.fasta"
            with open(output_file, 'w') as file:
                file.write(response.text)
            return output_file
        else:
            raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")

    def filter_sequences(self, chain_id: str) -> List[SeqRecord]:
        """Return FASTA records whose description mentions the given chain."""
        return list(filter(lambda x: f"Chain {chain_id}" in x.description, self.read_fasta()))

    def read_fasta(self):
        """
        Parse the entry's FASTA file, downloading it first if missing.

        Returns:
            Iterator over SeqRecord objects (one per FASTA record).
        """
        fasta_file = self.pdb_file.parent / f"{self.pid}.fasta"
        if not fasta_file.exists():
            self.download_fasta()
        return SeqIO.parse(fasta_file.as_posix(), "fasta")
@dataclass
class PDBAlign:
    """
    Superpose a target structure onto a template with PyMOL.

    Attributes:
        template_file (Path): Reference structure.
        target_file (Path): Structure to be aligned onto the template.
        out_file (Path): Optional output path kept for callers; align() itself
            returns the aligned coordinates as a string and writes nothing.
    """
    template_file: Path
    target_file: Path
    out_file: Path = field(default=None)

    def align(self):
        """Load both structures, align target onto template, and return the
        aligned target as a PDB-format string."""
        cmd.reinitialize()
        # Load the template first, then the structure to be aligned.
        for path, label in ((self.template_file, "template"), (self.target_file, "target")):
            cmd.load(path.as_posix(), label)
        cmd.align("target", "template")
        return cmd.get_pdbstr('target')
def main(PDB_ID, PDB_file_path):
    """
    Run the full analysis / loop-repair pipeline for one PDB entry.

    Args:
        PDB_ID (str): PDB identifier used to name output files.
        PDB_file_path (Path): Path to the input PDB file.

    Side effects: writes per-chain PDB files, FASTA files, Modeller models and
    a merged '{PDB_ID}.modellerfix.pdb' into the working directory.
    """
    # Example:
    # PDB_ID = '5sws'
    # PDB_file_path = Path(f'{PDB_ID}.pdb')
    analyzer = PDBAnalyzer(PDB_file_path)
    sequences = analyzer.extract_sequences(missing_char='-')  # '-', 'X' or '' all work
    print(f'Residues info for {PDB_ID}: \n', sequences)
    missing_info = analyzer.extract_sequences_info()
    print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
    # Example: extracting chain A with biopython keeps HETATM records
    chain_extractor = analyzer.extract_chain('A')
    chain_extractor.save('biopython_extracted_chain_A.pdb')
    # Example: extracting chain A with biopandas drops HETATM records
    chain_extractor = analyzer.split_chain('A')
    chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb')
    # Rename chain A to B and save it as a standalone file
    analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
    # Split every chain into its own file
    split_dict = {}
    for j in analyzer.chain_id_list:
        fn = Path(f'{PDB_ID}_{j}.pdb')
        analyzer.split_chain(j).to_pdb(fn.as_posix())
        split_dict[j] = fn.read_text()
    # Repair the loop (missing-residue) regions with Modeller
    from build_modeller import PDBModeler
    from modeller import ModellerError
    mc_dict = {}
    for mc in missing_info:
        # BUGFIX: the per-chain file name was hard-coded to '5sws_...'.
        out_file = f'{PDB_ID}_{mc}.pdb'
        analyzer.split_chain(mc).to_pdb(out_file)  # pdb file of the gapped chain
        mc_fasta = analyzer.filter_sequences(mc)  # fasta records for that chain
        if len(mc_fasta) == 1:
            mc_fasta = mc_fasta[0]
            out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
            analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
            print(f'>{mc_fasta.description}')
            print(mc_fasta.seq)
            modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
            try:
                modeller_results = modeller.make_model()
            except ModellerError:
                # Modeller could not build this chain; skip it and carry on.
                print(f'Failed to build model for chain {mc}')
                print(f'No loops detected in {out_fasta_file.name}')
                print('may pdb file sequence is not correct')
                continue
            print(f'Model files: {[file.name for file in modeller_results]}')
            # Restore the original chain id in every generated model
            for model_file in modeller_results:
                manalyzer = PDBAnalyzer(model_file)
                manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(model_file)
            if len(modeller_results) == 1:
                # Superpose the repaired chain onto the input with PyMOL
                aligner = PDBAlign(PDB_file_path, modeller_results[0], Path(f'{PDB_ID}_merge_model.pdb'))
                pdbstr = aligner.align()
                mc_dict[mc] = pdbstr
            else:
                print('more than one model file, please set num_loop to 1')
        elif len(mc_fasta) == 0:
            # No FASTA record matched this chain (consistent with the
            # __main__ batch variant): nothing to repair, skip it.
            continue
        else:
            raise ValueError(f'only can fix one chain content: {mc_fasta}')
    # Merge untouched chains with the repaired ones and save
    split_dict.update(mc_dict)
    import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')
def import_and_merge_pdb_strings(pdb_strings, merged_object_name, output_file):
    """
    Import several objects from PDB strings, merge them into a single PyMOL
    object, and save the result as one PDB file.

    :param pdb_strings: dict mapping chain ids to PDB data strings.
    :param merged_object_name: name given to the merged object.
    :param output_file: path of the output file.
    """
    # Reset PyMOL state (needed when running from a script / non-interactive session).
    cmd.reinitialize()
    object_names = []
    for chain_id, pdb_str in pdb_strings.items():
        name = f"chain_{chain_id}"  # unique per-chain object name
        cmd.read_pdbstr(pdb_str, name)
        object_names.append(name)
    # Combine every imported object into one and write it out.
    cmd.create(merged_object_name, ' or '.join(object_names))
    cmd.save(output_file, merged_object_name)
if __name__ == "__main__":
    # Batch driver: run the analysis / loop-repair pipeline over every PDB
    # file under ../PDBfile. Kept inline (rather than calling main) so the
    # batch-specific behavior is self-contained.
    pdbfiles = list(Path('../PDBfile').glob('*.pdb'))
    for pdb_path in pdbfiles:
        PDB_file_path = pdb_path
        PDB_ID = pdb_path.stem
        analyzer = PDBAnalyzer(PDB_file_path)
        sequences = analyzer.extract_sequences(missing_char='-')  # '-', 'X' or '' all work
        print(f'Residues info for {PDB_ID}: \n', sequences)
        missing_info = analyzer.extract_sequences_info()
        print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
        # Example: extracting chain A with biopython keeps HETATM records
        chain_extractor = analyzer.extract_chain('A')
        chain_extractor.save('biopython_extracted_chain_A.pdb')
        # Example: extracting chain A with biopandas drops HETATM records
        chain_extractor = analyzer.split_chain('A')
        chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb')
        # Rename chain A to B and save it as a standalone file
        analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
        # Split every chain into its own file
        split_dict = {}
        for j in analyzer.chain_id_list:
            fn = Path(f'{PDB_ID}_{j}.pdb')
            analyzer.split_chain(j).to_pdb(fn.as_posix())
            split_dict[j] = fn.read_text()
        # Repair the loop (missing-residue) regions with Modeller
        from build_modeller import PDBModeler
        from modeller import ModellerError
        mc_dict = {}
        for mc in missing_info:
            # BUGFIX: the per-chain file name was hard-coded to '5sws_...'.
            out_file = f'{PDB_ID}_{mc}.pdb'
            analyzer.split_chain(mc).to_pdb(out_file)  # pdb file of the gapped chain
            mc_fasta = analyzer.filter_sequences(mc)  # fasta records for that chain
            if len(mc_fasta) == 1:
                mc_fasta = mc_fasta[0]
                out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
                analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
                print(f'>{mc_fasta.description}')
                print(mc_fasta.seq)
                modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
                try:
                    modeller_results = modeller.make_model()
                except ModellerError:
                    # Modeller could not build this chain; skip it and carry on.
                    print(f'Failed to build model for chain {mc}')
                    print(f'No loops detected in {out_fasta_file.name}')
                    print('may pdb file sequence is not correct')
                    continue
                print(f'Model files: {[file.name for file in modeller_results]}')
                # Restore the original chain id in every generated model.
                # (renamed from `i`, which shadowed the outer loop variable)
                for model_file in modeller_results:
                    manalyzer = PDBAnalyzer(model_file)
                    manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(model_file)
                if len(modeller_results) == 1:
                    # Superpose the repaired chain onto the input with PyMOL
                    aligner = PDBAlign(PDB_file_path, modeller_results[0], Path(f'{PDB_ID}_merge_model.pdb'))
                    pdbstr = aligner.align()
                    mc_dict[mc] = pdbstr
                else:
                    print('more than one model file, please set num_loop to 1')
            elif len(mc_fasta) == 0:
                # No FASTA record matched this chain: nothing to repair.
                continue
            else:
                raise ValueError(f'only can fix one chain content: {mc_fasta}')
        # Merge untouched chains with the repaired ones and save
        split_dict.update(mc_dict)
        import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')

111
build_modellel.py Normal file
View File

@@ -0,0 +1,111 @@
from pathlib import Path
from modeller import *
from modeller.automodel import * # 加载 AutoModel 类
import time
import os
# micromamba create -n modeller modeller biopython pymol-open-source biopandas -y -c conda-forge -c salilab
# License key: MODELIRANJE
# python /opt/software/docking_pipeline/scripts/protein_structure/modeller/build_modellel.py -s /mnt/AppData/task-27/ProteinCompletion/protein.pdb -f /mnt/AppData/task-27/ProteinCompletion/seq.fasta -o ./ -n 1 -m refine.fast
def make_model(structure_file,
               sequence_file,
               outdir: str,
               chain: str,
               num_loop: int = 2,
               md_level: str = 'refine.fast'
               ):
    """
    Build Modeller loop models for one chain of a structure.

    Args:
        structure_file: Path to the template PDB file.
        sequence_file: Path to the PIR (.ali) file with the target sequence.
        outdir: Directory receiving the alignment and model output.
        chain: Chain identifier to model.
        num_loop: Number of loop models to generate.
        md_level: Refinement level name ('refine.slow', 'refine.fast' or
            'refine.very_fast'); any other value keeps Modeller's default.
    """
    print("***************************************************")
    print("md_level ====", md_level)
    print("***************************************************")
    template_code = Path(structure_file).stem
    target_code = Path(sequence_file).stem
    started = time.time()
    # Align the structure against the target sequence.
    env1 = Environ()
    # Read the model from the PDB file and append it to the alignment.
    mdl = Model(env1, file=structure_file, model_segment=(f'FIRST:{chain}', f'LAST:{chain}'))
    aln = Alignment(env1)
    aln.append_model(mdl, align_codes=template_code, atom_files=structure_file)
    # Add the target sequence, then run the 2D (structure/sequence) alignment.
    aln.append(file=sequence_file, align_codes=target_code)
    aln.align2d()
    # Persist the alignment in both PIR and PAP formats.
    aln.write(file=f'{outdir}/alignment.ali', alignment_format='PIR')
    aln.write(file=f'{outdir}/alignment.pap', alignment_format='PAP')
    log.verbose()
    # Rebuild the structure with the LoopModel automodel class.
    env2 = Environ()
    env2.io.atom_files_directory = ['.']
    loop_model = LoopModel(env2,
                           alnfile=f'{outdir}/alignment.ali',
                           knowns=template_code,
                           sequence=target_code,
                           loop_assess_methods=(assess.DOPE, assess.GA341))
    # Number of loop models produced is (ending - starting) + 1.
    loop_model.loop.starting_model = 1
    loop_model.loop.ending_model = int(num_loop)
    # Map the textual md_level onto Modeller's refine constants; unknown
    # values leave the default untouched (same as the original if-chain).
    refine_by_name = {
        'refine.slow': refine.slow,
        'refine.very_fast': refine.very_fast,
        'refine.fast': refine.fast,
    }
    chosen = refine_by_name.get(md_level.strip())
    if chosen is not None:
        loop_model.loop.md_level = chosen
    # Generate the models.
    loop_model.make()
    print(f"Time cost: {time.time() - started}s")
def fasta_to_ali(fasta_file, outdir):
    """
    Convert the first record of a FASTA file into a Modeller PIR (.ali) file.

    Args:
        fasta_file: Path to a FASTA file; only the first record is used.
        outdir: Directory where the .ali file is written (created if missing).

    Returns:
        str: Path of the generated '{stem}_full.ali' file.
    """
    os.makedirs(outdir, exist_ok=True)
    p_fasta = Path(fasta_file)
    sequence = p_fasta.stem
    # BUGFIX: the original kept only the second file line, truncating wrapped
    # (multi-line) FASTA sequences; join every sequence line of the first
    # record instead, stopping at the next header.
    seq_parts = []
    with open(fasta_file, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                if seq_parts:
                    break  # second record reached; only the first is used
                continue
            if line:
                seq_parts.append(line)
    seq = ''.join(seq_parts)
    ali_file = f'{outdir}/{sequence}_full.ali'
    with open(ali_file, 'w') as f:
        f.write(f'>P1;{sequence}_full\n')
        f.write(f'sequence:{sequence}_full:::::::0.00: 0.00\n')
        f.write(f'{seq}*')
    return ali_file
if __name__ == "__main__":
    import argparse

    # Command-line driver: convert the FASTA to .ali, then build the model.
    parser = argparse.ArgumentParser(description="Build model by Modeller")
    parser.add_argument("-s", "--structure", help="Structure file")
    parser.add_argument("-o", "--outdir", help="Output directory")
    parser.add_argument("-f", "--fasta", help="Fasta file")
    parser.add_argument("-n", "--num_loop", help="Number of loop model")
    parser.add_argument("-m", "--md_level", help="MD level")
    parser.add_argument("-c", "--chain", help="your fix chain ID")
    cli = parser.parse_args()

    ali_path = fasta_to_ali(cli.fasta, cli.outdir)
    make_model(cli.structure,
               ali_path,
               cli.outdir,
               cli.chain,
               cli.num_loop,
               cli.md_level)
    # Example invocations:
    # python build_modellel.py -s 5sws_fixer.pdb -o ./5swsmodellerfix -f rcsb_pdb_5SWS.fasta -n 1 -m refine.fast -c A
    # python build_modellel.py -s ../5sws_fixer.pdb -o ./5swsmodellerfix -f ../rcsb_pdb_5SWS.fasta -n 1 -m refine.fast -c D

115
build_modeller.py Normal file
View File

@@ -0,0 +1,115 @@
from dataclasses import dataclass, field
from pathlib import Path
from modeller import *
from modeller.automodel import *
import time
from typing import List
@dataclass
class PDBModeler:
    """
    Build Modeller loop models for one chain of a structure.

    Attributes:
        structure_file (Path): Template PDB file.
        fasta_file (Path): FASTA file holding the full target sequence.
        outdir (Path): Output directory (created on demand).
        chain (str): Chain identifier to model.
        num_loop (int): Number of loop models to generate.
        md_level (str): MD refinement level name.
    """
    structure_file: Path
    fasta_file: Path
    outdir: Path
    chain: str
    num_loop: int = 2
    md_level: str = 'refine.fast'  # refine.very_fast or refine.slow optional

    def __post_init__(self):
        # Align codes are the file stems; the .ali file is generated eagerly
        # so make_model can rely on its existence.
        self.structure = self.structure_file.stem
        self.sequence = self.fasta_file.stem
        self.ali_file = self.fasta_to_ali()

    def make_model(self):
        """
        Align the structure to the target sequence and build loop models.

        Returns:
            List[Path]: Paths of the successfully generated model files.
        """
        print("***************************************************")
        print("md_level ====", self.md_level)
        print("***************************************************")
        time_start = time.time()
        env1 = Environ()
        mdl = Model(env1, file=self.structure_file.as_posix(), model_segment=(f'FIRST:{self.chain}', f'LAST:{self.chain}'))
        aln = Alignment(env1)
        aln.append_model(mdl, align_codes=self.structure, atom_files=self.structure_file.as_posix())
        aln.append(file=self.ali_file.as_posix(), align_codes=self.sequence)
        aln.align2d()
        aln.write(file=(self.outdir / 'alignment.ali').as_posix(), alignment_format='PIR')
        aln.write(file=(self.outdir / 'alignment.pap').as_posix(), alignment_format='PAP')
        log.verbose()
        env2 = Environ()
        env2.io.atom_files_directory = ['.']
        loop_model = LoopModel(env2,
                               alnfile=(self.outdir / 'alignment.ali').as_posix(),
                               knowns=self.structure,
                               sequence=self.sequence,
                               loop_assess_methods=(assess.DOPE, assess.GA341))
        # Number of loop models produced is (ending - starting) + 1.
        loop_model.loop.starting_model = 1
        loop_model.loop.ending_model = self.num_loop
        # Select the MD refinement level; unknown names keep Modeller's default.
        if self.md_level.strip() == 'refine.slow':
            loop_model.loop.md_level = refine.slow
        elif self.md_level.strip() == 'refine.very_fast':
            loop_model.loop.md_level = refine.very_fast
        elif self.md_level.strip() == 'refine.fast':
            loop_model.loop.md_level = refine.fast
        # Run the LoopModel build.
        loop_model.make()
        end_time = time.time()
        print(f"Time cost: {end_time - time_start}s")
        # Collect the paths of every successfully generated model file.
        model_files = self.get_model_files(loop_model)
        if model_files:
            print(f"Model files: {[file.name for file in model_files]}")
        else:
            print("No model files found.")
        return model_files

    def get_model_files(self, loop_model) -> List[Path]:
        """Collect paths of models Modeller reports as successful (no 'failure')."""
        model_files = []
        for output in loop_model.loop.outputs:
            if output.get('failure') is None:
                model_files.append(Path(output.get('name')))
        return model_files

    def fasta_to_ali(self) -> Path:
        """
        Convert the first FASTA record into a Modeller PIR (.ali) file.

        Returns:
            Path: The generated .ali file inside outdir.
        """
        if not self.outdir.exists():
            self.outdir.mkdir(parents=True, exist_ok=True)
        ali_file = self.outdir / f'{self.sequence}.ali'
        if ali_file.exists():
            ali_file.unlink()
        # BUGFIX: the original kept only the second file line, truncating
        # wrapped (multi-line) FASTA sequences; join every sequence line of
        # the first record instead, stopping at the next header.
        seq_parts = []
        with open(self.fasta_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line.startswith('>'):
                    if seq_parts:
                        break  # second record reached; only the first is used
                    continue
                if line:
                    seq_parts.append(line)
        seq = ''.join(seq_parts)
        with open(ali_file, 'w') as f:
            f.write(f'>P1;{self.sequence}\n')
            f.write(f'sequence:{self.sequence}:::::::0.00: 0.00\n')
            f.write(f'{seq}*')
        return ali_file
if __name__ == "__main__":
    import argparse

    # Command-line driver for PDBModeler.
    parser = argparse.ArgumentParser(description="Build model by Modeller")
    parser.add_argument("-s", "--structure", help="Structure file")
    parser.add_argument("-o", "--outdir", help="Output directory")
    parser.add_argument("-f", "--fasta", help="Fasta file")
    parser.add_argument("-n", "--num_loop", help="Number of loop model")
    parser.add_argument("-m", "--md_level", help="MD level")
    parser.add_argument("-c", "--chain", help="Chain ID")
    cli = parser.parse_args()

    builder = PDBModeler(Path(cli.structure),
                         Path(cli.fasta),
                         Path(cli.outdir),
                         cli.chain,
                         int(cli.num_loop),
                         cli.md_level)
    builder.make_model()
    # test command
    # python build_modellel.py -s ../5sws_fixer.pdb -o ./5swsmodellerfix -f ../rcsb_pdb_5SWS.fasta -n 1 -m refine.very_fast -c D