first version
This commit is contained in:
465
script/biotools.py
Normal file
465
script/biotools.py
Normal file
@@ -0,0 +1,465 @@
|
||||
from email import header
|
||||
from unittest.mock import Base
|
||||
|
||||
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
'''
|
||||
@file :biotools.py
|
||||
@Description: :
|
||||
@Date :2021/06/22 16:36:34
|
||||
@Author :hotwa
|
||||
@version :1.0
|
||||
'''
|
||||
import os,re,time
|
||||
from Bio.PDB.PDBParser import PDBParser
|
||||
from Bio.PDB import PDBIO,Select,parse_pdb_header,PDBList
|
||||
from tools import mdcheck,get_path_contents
|
||||
import requests
|
||||
import random,string
|
||||
from pathlib import Path
|
||||
|
||||
file_dir = os.path.dirname(os.path.realpath(__file__)) # 当前python文件的绝对路径
|
||||
residue = [i.upper() for i in 'Gly,Ala,Val,Leu,Ile,Pro,Phe,Tyr,Trp,Ser,Thr,Cys,Met,Asn,Gln,Asp,Glu,Lys,Arg,His'.split(',')]
|
||||
|
||||
""" getpdbfile function"""
|
||||
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 -- '+''.join(random.sample(string.ascii_letters+string.digits, 32))
|
||||
HEADER = {'User-Agent': agent}
|
||||
|
||||
def getpdbfile(pdbid,dir): # 下载备用,不是很稳定
|
||||
pdbid = pdbid.upper()
|
||||
res = os.path.exists(f'{pdbid}.pdb')
|
||||
if not res:
|
||||
r = requests.get(url = f'https://files.rcsb.org/download/{pdbid}.pdb1',verify=False,headers=HEADER)
|
||||
_file_path = os.path.join(dir,f'{pdbid}.pdb')
|
||||
with open(_file_path,'wb+') as f:
|
||||
f.write(r.content)
|
||||
|
||||
""" getpdbfile function"""
|
||||
|
||||
|
||||
class PDBLengthError(BaseException):
|
||||
def __init__(self,arg):
|
||||
self.arg = arg
|
||||
|
||||
def downloadpdbfile(pdbid,pdir,overwrite=True,changename=True):
|
||||
"""downloadpdbfile [summary]
|
||||
|
||||
[extended_summary]
|
||||
|
||||
:param pdbid: like code 5xv7
|
||||
:type pdbid: string
|
||||
:param pdir: save pdb file path
|
||||
:type pdir: string
|
||||
:param overwrite: overwrite pdb file, defaults to True
|
||||
:type overwrite: bool, optional
|
||||
:param changename: change .ent to .pdb, defaults to True
|
||||
:type changename: bool, optional
|
||||
"""
|
||||
if isinstance(pdbid,list):
|
||||
for i in pdbid:
|
||||
downloadpdbfile_sub(pdbid=i,dir = pdir,overwrite=overwrite,changename=changename)
|
||||
elif isinstance(pdbid,str):
|
||||
if len(pdbid) == 4:
|
||||
downloadpdbfile_sub(pdbid=pdbid,dir = pdir,overwrite=overwrite,changename=changename)
|
||||
else:
|
||||
raise PDBLengthError('pdbid length error!')
|
||||
|
||||
|
||||
def downloadpdbfile_sub(pdbid,dir,overwrite,changename):
|
||||
_pdbname = f'pdb{pdbid}.ent'
|
||||
_pdbname_new = f'{pdbid}.pdb'
|
||||
if not overwrite:
|
||||
_path = os.path.join(dir,_pdbname_new)
|
||||
if os.path.exists(_path):
|
||||
...
|
||||
else:
|
||||
pdbl = PDBList()
|
||||
try:
|
||||
pdbl.retrieve_pdb_file(pdbid,file_format='pdb',pdir = dir,overwrite=overwrite)
|
||||
except FileNotFoundError: # 调用biopython下载接口失败,使用自己的接口
|
||||
getpdbfile(pdbid,dir)
|
||||
if changename:
|
||||
srcFile = os.path.join(dir,_pdbname)
|
||||
dstFile = os.path.join(dir,_pdbname_new)
|
||||
try:
|
||||
os.rename(srcFile,dstFile)
|
||||
except FileExistsError as e: # 如果重名名的文件存在,则表示该文件已经下载
|
||||
...
|
||||
except Exception as e:
|
||||
raise e
|
||||
elif overwrite: # 覆盖之前已经下载的pdb文件
|
||||
pdbl = PDBList()
|
||||
try:
|
||||
pdbl.retrieve_pdb_file(pdbid,file_format='pdb',pdir = dir,overwrite=overwrite)
|
||||
except FileNotFoundError: # 调用biopython下载接口失败,使用自己的接口
|
||||
getpdbfile(pdbid,dir)
|
||||
if changename:
|
||||
srcFile = os.path.join(dir,_pdbname)
|
||||
dstFile = os.path.join(dir,_pdbname_new)
|
||||
try:
|
||||
os.rename(srcFile,dstFile)
|
||||
except Exception as e:
|
||||
raise e
|
||||
else:
|
||||
print('error overwrite params!')
|
||||
|
||||
class ChainSelect(Select):
|
||||
def __init__(self,chain_string='A'):
|
||||
super(Select,self).__init__()
|
||||
self.chain_string = chain_string
|
||||
|
||||
def accept_chain(self, chain):
|
||||
if str(chain.__repr__()) == '<Chain id={}>'.format(self.chain_string): # judge chain name
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
class ResidueSelect(Select):
|
||||
"""ResidueSelect ues by select
|
||||
|
||||
[extended_summary]
|
||||
|
||||
:param Select: [description]
|
||||
:type Select: [type]
|
||||
"""
|
||||
def __init__(self,residue):
|
||||
super(Select,self).__init__()
|
||||
self.residue = residue
|
||||
|
||||
def accept_residue(self, residue):
|
||||
if residue.get_resname()==self.residue:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
def return_path(path):
|
||||
# 判断是否为绝对路径
|
||||
path_split_string = os.path.split(path)
|
||||
if os.path.isdir(path_split_string[0]) and os.path.isfile(path):
|
||||
return path # 输入绝对路径,返回绝对路径
|
||||
elif bool(path_split_string[0]) == False:
|
||||
return os.path.join(file_dir,path) # 尝试在当前目录下自动补全路径
|
||||
elif not (path_split_string[1][-4:] == '.pdb' or '.ent'):
|
||||
raise ValueError(f'this file, {path} is not a valid pdb file')
|
||||
else:
|
||||
raise ValueError(f'valid path: {path}')
|
||||
|
||||
# ! error class for Bdp
|
||||
class DoiError(BaseException):
|
||||
def __init__(self,arg):
|
||||
self.arg = arg
|
||||
|
||||
class ChainError(BaseException):
|
||||
def __init__(self,arg):
|
||||
self.arg = arg
|
||||
|
||||
|
||||
class Bdp(object):
|
||||
__slots__ = ['path','sid','mole_id','mole_struct','create_path']
|
||||
|
||||
def __init__(self,path,sid):
|
||||
self.path = path
|
||||
self.create_path = os.path.dirname(os.path.dirname(path))
|
||||
self.sid = sid
|
||||
self.mole_id = []
|
||||
self.mole_struct = []
|
||||
# 初始化数据
|
||||
self.read_formula()
|
||||
self.init_path()
|
||||
|
||||
@property
|
||||
def chain_num(self):
|
||||
s = self.read_struct()
|
||||
first_model = s[0]
|
||||
return len(first_model)
|
||||
|
||||
@property
|
||||
def header_infos(self):
|
||||
return parse_pdb_header(self.path)
|
||||
|
||||
@property
|
||||
def doi(self):
|
||||
_journal = self.header_infos['journal']
|
||||
_doi = _journal.split()[-1]
|
||||
if _doi.startswith('10.'):
|
||||
return _doi
|
||||
else:
|
||||
raise DoiError(f'current pdb does not doi, {_journal}')
|
||||
|
||||
def mkdir(self,pname,md = False):
|
||||
"""mkdir [summary]
|
||||
|
||||
[extended_summary]
|
||||
|
||||
:param pname: [description]
|
||||
:type pname: [type]
|
||||
:param md: [dict], defaults to False
|
||||
:type md: [dict], optional
|
||||
"""
|
||||
def wrapper(func):
|
||||
def deco(self,*arg,**kwargs):
|
||||
create_path = os.path.join(self.create_path,pname)
|
||||
if os.path.exists(create_path):
|
||||
...
|
||||
else:
|
||||
os.mkdir(create_path)
|
||||
# create markdown
|
||||
mdc = mdcheck(md)
|
||||
header_info = f'''---
|
||||
title: {mdc['title']}
|
||||
date: {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}
|
||||
type: {mdc['typeinfo']}
|
||||
---
|
||||
[TOC]
|
||||
# Description
|
||||
{mdc['description']}
|
||||
'''
|
||||
create_path_md = os.path.join(create_path,'README.md')
|
||||
with open(create_path_md,'w+') as f:
|
||||
f.write(header_info)
|
||||
func(self,*arg,**kwargs)
|
||||
return deco
|
||||
return wrapper
|
||||
|
||||
def value_check(self):
|
||||
try:
|
||||
if len(self.mole_id) == len(self.mole_struct):
|
||||
print('check pass!')
|
||||
else:
|
||||
raise ValueError('molecule identifier do not match formula identifier! manual check')
|
||||
except :
|
||||
raise ValueError
|
||||
|
||||
@staticmethod
|
||||
def read_line_list(first_column,path):
|
||||
"""read_line_list 读取第一列为first_column字符串的行,存为列表返回
|
||||
|
||||
[extended_summary]
|
||||
|
||||
Arguments:
|
||||
first_column {[string]} -- [pdb文件内容第一列常常为大写]
|
||||
path {[string]} -- [路径]
|
||||
|
||||
Returns:
|
||||
[iter] -- [含有first_column字符串的生成器]
|
||||
"""
|
||||
stringiter = Bdp.stringlinesiter(file = path)
|
||||
stringiter = map(lambda x:x.strip(), stringiter)
|
||||
return filter(lambda x:x.split()[0]==first_column,stringiter)
|
||||
|
||||
@staticmethod
|
||||
def stringlinesiter(file):
|
||||
with open(file, 'r+', encoding='utf-8') as f:
|
||||
yield from f.readlines()
|
||||
|
||||
@mkdir(self = 'self',pname = 'split_molecule')
|
||||
def split_molecule(self,residue):
|
||||
"""split_molecule [split molecule or residue line save to file]
|
||||
|
||||
[extended_summary]
|
||||
|
||||
:param residue: [select residue]
|
||||
:type residue: [string]
|
||||
:extract_chain: extract molecule corresponding chain
|
||||
:type extract_chain: bool
|
||||
"""
|
||||
remove_list,residue = [],str(residue)
|
||||
s = self.read_struct()
|
||||
chainlist = self.getchain_str()
|
||||
# 对每条链上都与这个小分子结合的链上的结合信息都提取出来
|
||||
for i in chainlist:
|
||||
io = PDBIO()
|
||||
io.set_structure(s[0][i])
|
||||
sidc = self.sid.replace('.pdb', '') if '.pdb' in self.sid else self.sid
|
||||
savename = f'{sidc}_{i}_{residue}.pdb'
|
||||
path = os.path.join(self.create_path,'split_molecule',savename)
|
||||
io.save(path,ResidueSelect(residue))
|
||||
if os.path.exists(path):
|
||||
with open(path,'r+') as f:
|
||||
content = f.readline()
|
||||
if content.strip() == 'END':
|
||||
# 该链没有小分子,删除文件
|
||||
# print(f'{savename} this chain have not molecule remove it')
|
||||
remove_list.append(path)
|
||||
for i in remove_list:
|
||||
os.remove(i) # remove the empty molecule file
|
||||
|
||||
|
||||
def getchain_str(self):
|
||||
"""getchain_str retrun chain symbol name
|
||||
|
||||
[extended_summary]
|
||||
|
||||
:return: [description]
|
||||
:rtype: list
|
||||
"""
|
||||
_l = []
|
||||
p = PDBParser(PERMISSIVE=1)
|
||||
s = p.get_structure(self.sid, self.path)
|
||||
chain_gennerate = s[0].copy().get_chains()
|
||||
for i in chain_gennerate:
|
||||
_l.append(i.id)
|
||||
_l.sort()
|
||||
return _l
|
||||
|
||||
def site_exists(self,residue,num):
|
||||
"""site_exists 判断改蛋白的某个位点是否存在,并返回该位点所在的链
|
||||
|
||||
用于判断结合位点的链在哪里
|
||||
|
||||
:param residue: 残基名称
|
||||
:type residue: string
|
||||
:param num: 残基编号
|
||||
:type num: int
|
||||
:return: chain
|
||||
:rtype: list
|
||||
"""
|
||||
_l = []
|
||||
num = int(num)
|
||||
residue = residue.strip().upper()
|
||||
chainlist = self.getchain_str()
|
||||
s = self.read_struct()
|
||||
for i in chainlist:
|
||||
try:
|
||||
resname_from_pdb = s[0][i][num].get_resname()
|
||||
if residue == resname_from_pdb:
|
||||
_l.append(i)
|
||||
except KeyError as e:
|
||||
print(e,f'chain {i} not exist {residue}-{num}')
|
||||
return _l
|
||||
|
||||
def _split_chain_save_struct(self,obj,sid):
|
||||
io = PDBIO()
|
||||
io.set_structure(obj)
|
||||
sidc = sid.replace('.pdb', '') if '.pdb' in sid else sid
|
||||
name,path = f'{sidc}_{obj.id}.pdb', self.create_path
|
||||
io.save(os.path.join(path,'split_chain',name),ChainSelect(obj.id))
|
||||
|
||||
@mkdir(self = 'self',pname ='split_chain')
|
||||
def split_chain(self,extract='auto'):
|
||||
"""split_chain [summary]
|
||||
|
||||
[extended_summary]
|
||||
|
||||
:param extract: [extract the chain], defaults to 'auto' extract all chains
|
||||
:type extract: [string], optional
|
||||
:raises ValueError: [save to pdb in current path in ./splitchain]
|
||||
"""
|
||||
# 将传入的蛋白文件按照蛋白链分割并按照 pdbid_A_m.pdb 重新命名 A代表链,m表示该链含有小分子
|
||||
p = PDBParser(PERMISSIVE=1)
|
||||
s = p.get_structure(self.sid, self.path)
|
||||
if extract == 'auto':
|
||||
chainlist = self.getchain_str()
|
||||
for j in chainlist:
|
||||
self._split_chain_save_struct(obj=s[0][j],sid=self.sid)
|
||||
else:
|
||||
chainlist = self.getchain_str()
|
||||
if extract not in chainlist:
|
||||
raise ChainError(f'The {extract} chain not exists !')
|
||||
if extract in chainlist:
|
||||
extract = extract.upper()
|
||||
self._split_chain_save_struct(obj=s[0][extract],sid=self.sid)
|
||||
else:
|
||||
raise ValueError
|
||||
|
||||
def read_struct(self):
|
||||
"""read_struct read pdb structure
|
||||
|
||||
https://biopython-cn.readthedocs.io/zh_CN/latest/cn/chr11.html#id3
|
||||
一个 Structure 对象的整体布局遵循称为SMCRA(Structure/Model/Chain/Residue/Atom,结构/模型/链/残基/原子)的体系架构:
|
||||
|
||||
结构由模型组成
|
||||
模型由多条链组成
|
||||
链由残基组成
|
||||
多个原子构成残基
|
||||
Returns:
|
||||
[obj] -- [Model]
|
||||
"""
|
||||
p = PDBParser(PERMISSIVE=1)
|
||||
s = p.get_structure(self.sid, self.path)
|
||||
return s
|
||||
|
||||
def get_chain(self,chain_str):
|
||||
s = self.read_struct()
|
||||
return s[0][chain_str]
|
||||
|
||||
def init_path(self):
|
||||
_p = return_path(self.path)
|
||||
self.path = _p
|
||||
if os.path.isfile(_p) == False:
|
||||
raise FileNotFoundError(f'file {_p} not found')
|
||||
|
||||
def read_formula(self):
|
||||
"""read_formula read pdb file molecule information and identifier in this pdb file(just like residue)
|
||||
|
||||
[extended_summary]
|
||||
|
||||
:raises OSError: Do not find this file
|
||||
"""
|
||||
_path = self.path
|
||||
try:
|
||||
with open(_path,'r') as f:
|
||||
text_line = f.readlines()
|
||||
_l = []
|
||||
for i in text_line:
|
||||
li = i.split()
|
||||
if li[0] == 'FORMUL':
|
||||
_l.append(li)
|
||||
self.mole_id.append(li[2])
|
||||
for ii in _l:
|
||||
self.mole_struct.append(''.join(ii[3:]))
|
||||
except Exception as e:
|
||||
raise e
|
||||
|
||||
def clean_pdb(self,outfile=None,path=None):
|
||||
__lst = Bdp.read_line_list('ATOM',self.path)
|
||||
_p = Path(path) if path else Path(self.path).absolute().parent
|
||||
_f = outfile if outfile else f"{self.sid}.clean.pdb"
|
||||
write_path = _p.joinpath(_f)
|
||||
if write_path.exists(): write_path.unlink()
|
||||
with open(write_path,'w+') as f:
|
||||
for i in __lst:
|
||||
f.write(f"{i}\n")
|
||||
|
||||
def renamepdbfile(path):
|
||||
"""renamepdbfile 自动对path路径中批量下载的pdb中文件的文件名进行重命名
|
||||
|
||||
[extended_summary]
|
||||
|
||||
Arguments:
|
||||
path {[type]} -- [description]
|
||||
"""
|
||||
_flist = get_path_contents(path)[0]
|
||||
for i in _flist:
|
||||
i = i.strip()
|
||||
basetuple = os.path.split(i)
|
||||
basedir = basetuple[0]
|
||||
_fn = basetuple[1]
|
||||
res = re.match(pattern='pdb([A-Za-z0-9]*).ent',string=_fn)
|
||||
# nfn = res.group(1)
|
||||
# newname =os.path.join(basedir,str(nfn)+'.pbd')
|
||||
# os.rename(src = i,dst = newname)
|
||||
try:
|
||||
nfn = res.group(1)
|
||||
newname =os.path.join(basedir,str(nfn)+'.pdb')
|
||||
os.rename(src = i,dst = newname)
|
||||
except IndexError:
|
||||
raise IndexError(f'Could not find pdbid in this filename: {i}')
|
||||
except AttributeError:
|
||||
continue
|
||||
except FileNotFoundError as e:
|
||||
if os.path.exists(i):
|
||||
raise e
|
||||
elif os.path.exists(newname):
|
||||
continue
|
||||
else:
|
||||
raise e
|
||||
|
||||
|
||||
|
||||
def pdbupdate(path):
|
||||
pl = PDBList(pdb=path)
|
||||
pl.update_pdb()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user