first version

2024-09-07 09:12:14 +08:00
commit bc8cc22a9c
27 changed files with 2530 additions and 0 deletions
--- a/script/biotools.py
+++ b/script/biotools.py
@@ -0,0 +1,465 @@
+from email import header
+from unittest.mock import Base
+
+
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@file        :biotools.py
+@Description:       :
+@Date     :2021/06/22 16:36:34
+@Author      :hotwa
+@version      :1.0
+'''
+import os,re,time
+from Bio.PDB.PDBParser import PDBParser
+from Bio.PDB import PDBIO,Select,parse_pdb_header,PDBList
+from tools import mdcheck,get_path_contents
+import requests
+import random,string
+from pathlib import Path
+
+file_dir = os.path.dirname(os.path.realpath(__file__)) # 当前python文件的绝对路径
+residue = [i.upper() for i in 'Gly,Ala,Val,Leu,Ile,Pro,Phe,Tyr,Trp,Ser,Thr,Cys,Met,Asn,Gln,Asp,Glu,Lys,Arg,His'.split(',')]
+
+""" getpdbfile function"""
+agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 -- '+''.join(random.sample(string.ascii_letters+string.digits, 32))
+HEADER = {'User-Agent': agent}
+
+def getpdbfile(pdbid,dir): # 下载备用,不是很稳定
+    pdbid = pdbid.upper()
+    res = os.path.exists(f'{pdbid}.pdb')
+    if not res:
+        r = requests.get(url = f'https://files.rcsb.org/download/{pdbid}.pdb1',verify=False,headers=HEADER)
+        _file_path = os.path.join(dir,f'{pdbid}.pdb')
+        with open(_file_path,'wb+') as f:
+            f.write(r.content)
+
+""" getpdbfile function"""
+
+
+class PDBLengthError(BaseException):
+    def __init__(self,arg):
+        self.arg = arg
+
+def downloadpdbfile(pdbid,pdir,overwrite=True,changename=True):
+    """downloadpdbfile [summary]
+    
+    [extended_summary]
+    
+    :param pdbid: like code 5xv7
+    :type pdbid: string
+    :param pdir: save pdb file path
+    :type pdir: string
+    :param overwrite: overwrite pdb file, defaults to True
+    :type overwrite: bool, optional
+    :param changename: change .ent to .pdb, defaults to True
+    :type changename: bool, optional
+    """
+    if isinstance(pdbid,list):
+        for i in pdbid:
+            downloadpdbfile_sub(pdbid=i,dir = pdir,overwrite=overwrite,changename=changename)
+    elif isinstance(pdbid,str):
+        if len(pdbid) == 4:
+            downloadpdbfile_sub(pdbid=pdbid,dir = pdir,overwrite=overwrite,changename=changename)
+    else:
+        raise PDBLengthError('pdbid length error!')
+
+
+def downloadpdbfile_sub(pdbid,dir,overwrite,changename):
+    _pdbname = f'pdb{pdbid}.ent'
+    _pdbname_new = f'{pdbid}.pdb'
+    if not overwrite: 
+        _path = os.path.join(dir,_pdbname_new)
+        if os.path.exists(_path): 
+            ...
+        else:
+            pdbl = PDBList()
+            try:
+                pdbl.retrieve_pdb_file(pdbid,file_format='pdb',pdir = dir,overwrite=overwrite)
+            except FileNotFoundError: # 调用biopython下载接口失败，使用自己的接口
+                getpdbfile(pdbid,dir)
+            if changename:
+                srcFile = os.path.join(dir,_pdbname)
+                dstFile = os.path.join(dir,_pdbname_new)
+                try:
+                    os.rename(srcFile,dstFile)
+                except FileExistsError as e: # 如果重名名的文件存在，则表示该文件已经下载
+                    ...
+                except Exception as e:
+                    raise e
+    elif overwrite: # 覆盖之前已经下载的pdb文件
+        pdbl = PDBList()
+        try:
+            pdbl.retrieve_pdb_file(pdbid,file_format='pdb',pdir = dir,overwrite=overwrite)
+        except FileNotFoundError: # 调用biopython下载接口失败，使用自己的接口
+            getpdbfile(pdbid,dir)
+        if changename:
+            srcFile = os.path.join(dir,_pdbname)
+            dstFile = os.path.join(dir,_pdbname_new)
+            try:
+                os.rename(srcFile,dstFile)
+            except Exception as e:
+                raise e
+    else:
+        print('error overwrite params!')
+
+class ChainSelect(Select):
+    def __init__(self,chain_string='A'):
+        super(Select,self).__init__()
+        self.chain_string = chain_string
+
+    def accept_chain(self, chain):
+        if str(chain.__repr__()) == '<Chain id={}>'.format(self.chain_string): # judge chain name
+            return 1
+        else:
+            return 0
+
+class ResidueSelect(Select):
+    """ResidueSelect ues by select 
+    
+    [extended_summary]
+    
+    :param Select: [description]
+    :type Select: [type]
+    """
+    def __init__(self,residue):
+        super(Select,self).__init__()
+        self.residue = residue
+
+    def accept_residue(self, residue):
+        if residue.get_resname()==self.residue:
+            return 1
+        else:
+            return 0
+
+def return_path(path):
+    # 判断是否为绝对路径
+    path_split_string = os.path.split(path)
+    if os.path.isdir(path_split_string[0]) and os.path.isfile(path):
+        return path # 输入绝对路径，返回绝对路径
+    elif bool(path_split_string[0]) == False:
+        return os.path.join(file_dir,path) # 尝试在当前目录下自动补全路径
+    elif  not (path_split_string[1][-4:] == '.pdb' or '.ent'):
+        raise ValueError(f'this file, {path} is not a valid pdb file')
+    else:
+        raise ValueError(f'valid path: {path}')
+
+# ! error class for Bdp
+class DoiError(BaseException):
+    def __init__(self,arg):
+        self.arg = arg
+
+class ChainError(BaseException):
+    def __init__(self,arg):
+        self.arg = arg
+
+
+class Bdp(object):
+    __slots__ = ['path','sid','mole_id','mole_struct','create_path']
+
+    def __init__(self,path,sid):
+        self.path = path
+        self.create_path = os.path.dirname(os.path.dirname(path))
+        self.sid = sid
+        self.mole_id = []
+        self.mole_struct = []
+        # 初始化数据
+        self.read_formula()
+        self.init_path()
+        
+    @property
+    def chain_num(self):
+        s = self.read_struct()
+        first_model = s[0]
+        return len(first_model)
+
+    @property
+    def header_infos(self):
+        return parse_pdb_header(self.path)
+
+    @property
+    def doi(self):
+        _journal = self.header_infos['journal']
+        _doi = _journal.split()[-1]
+        if _doi.startswith('10.'):
+            return _doi
+        else:
+            raise DoiError(f'current pdb does not doi, {_journal}')
+
+    def mkdir(self,pname,md = False):
+        """mkdir [summary]
+        
+        [extended_summary]
+        
+        :param pname: [description]
+        :type pname: [type]
+        :param md: [dict], defaults to False
+        :type md: [dict], optional
+        """
+        def wrapper(func):
+            def deco(self,*arg,**kwargs):
+                create_path = os.path.join(self.create_path,pname)
+                if os.path.exists(create_path):
+                    ...
+                else:
+                    os.mkdir(create_path)
+                # create markdown
+                mdc = mdcheck(md)
+                header_info = f'''---
+    title: {mdc['title']}
+    date: {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}
+    type: {mdc['typeinfo']}
+    ---
+    [TOC]
+    # Description
+    {mdc['description']}
+    '''
+                create_path_md = os.path.join(create_path,'README.md')
+                with open(create_path_md,'w+') as f:
+                    f.write(header_info)
+                func(self,*arg,**kwargs)
+            return deco
+        return wrapper
+
+    def value_check(self):
+        try:
+            if len(self.mole_id) == len(self.mole_struct):
+                print('check pass!')
+            else:
+                raise ValueError('molecule identifier do not match formula identifier! manual check')
+        except :
+            raise ValueError
+
+    @staticmethod
+    def read_line_list(first_column,path):
+        """read_line_list 读取第一列为first_column字符串的行，存为列表返回
+
+        [extended_summary]
+
+        Arguments:
+            first_column {[string]} -- [pdb文件内容第一列常常为大写]
+            path {[string]} -- [路径]
+
+        Returns:
+            [iter] -- [含有first_column字符串的生成器]
+        """
+        stringiter = Bdp.stringlinesiter(file = path)
+        stringiter = map(lambda x:x.strip(), stringiter)
+        return filter(lambda x:x.split()[0]==first_column,stringiter)
+
+    @staticmethod
+    def stringlinesiter(file):
+        with open(file, 'r+', encoding='utf-8') as f:
+            yield from f.readlines()
+
+    @mkdir(self = 'self',pname = 'split_molecule')
+    def split_molecule(self,residue):
+        """split_molecule [split molecule or residue line save to file]
+        
+        [extended_summary]
+        
+        :param residue: [select residue]
+        :type residue: [string]
+        :extract_chain: extract molecule corresponding chain
+        :type extract_chain: bool
+        """
+        remove_list,residue = [],str(residue)
+        s = self.read_struct()
+        chainlist = self.getchain_str()
+        # 对每条链上都与这个小分子结合的链上的结合信息都提取出来
+        for i in chainlist:
+            io = PDBIO()
+            io.set_structure(s[0][i])
+            sidc = self.sid.replace('.pdb', '') if '.pdb' in self.sid else self.sid
+            savename = f'{sidc}_{i}_{residue}.pdb'
+            path = os.path.join(self.create_path,'split_molecule',savename)
+            io.save(path,ResidueSelect(residue))
+            if os.path.exists(path):
+                with open(path,'r+') as f:
+                    content = f.readline()
+                    if content.strip() == 'END':
+                        # 该链没有小分子，删除文件
+                        # print(f'{savename} this chain have not molecule remove it')
+                        remove_list.append(path)
+        for i in remove_list:
+            os.remove(i) # remove the empty molecule file
+
+
+    def getchain_str(self):
+        """getchain_str retrun chain symbol name
+        
+        [extended_summary]
+        
+        :return: [description]
+        :rtype: list
+        """
+        _l = []
+        p = PDBParser(PERMISSIVE=1)
+        s = p.get_structure(self.sid, self.path)
+        chain_gennerate = s[0].copy().get_chains()
+        for i in chain_gennerate:
+            _l.append(i.id)
+            _l.sort()
+        return _l
+
+    def site_exists(self,residue,num):
+        """site_exists 判断改蛋白的某个位点是否存在，并返回该位点所在的链
+        
+        用于判断结合位点的链在哪里
+
+        :param residue: 残基名称
+        :type residue: string
+        :param num: 残基编号
+        :type num: int
+        :return: chain
+        :rtype: list
+        """
+        _l = []
+        num = int(num)
+        residue = residue.strip().upper()
+        chainlist = self.getchain_str()
+        s = self.read_struct()
+        for i in chainlist:
+            try:
+                resname_from_pdb = s[0][i][num].get_resname()
+                if residue == resname_from_pdb:
+                    _l.append(i)
+            except KeyError as e:
+                print(e,f'chain {i} not exist {residue}-{num}')
+        return _l
+
+    def _split_chain_save_struct(self,obj,sid):
+        io = PDBIO()
+        io.set_structure(obj)
+        sidc = sid.replace('.pdb', '') if '.pdb' in sid else sid
+        name,path = f'{sidc}_{obj.id}.pdb', self.create_path
+        io.save(os.path.join(path,'split_chain',name),ChainSelect(obj.id))
+
+    @mkdir(self = 'self',pname ='split_chain')
+    def split_chain(self,extract='auto'):
+        """split_chain [summary]
+        
+        [extended_summary]
+        
+        :param extract: [extract the chain], defaults to 'auto' extract all chains
+        :type extract: [string], optional
+        :raises ValueError: [save to pdb in current path in ./splitchain]
+        """
+        # 将传入的蛋白文件按照蛋白链分割并按照 pdbid_A_m.pdb 重新命名 A代表链，m表示该链含有小分子
+        p = PDBParser(PERMISSIVE=1)
+        s = p.get_structure(self.sid, self.path)
+        if extract == 'auto':
+            chainlist = self.getchain_str()
+            for j in chainlist:
+                self._split_chain_save_struct(obj=s[0][j],sid=self.sid)
+        else:
+            chainlist = self.getchain_str()
+            if extract not in chainlist:
+                raise ChainError(f'The {extract} chain not exists !')
+            if extract in chainlist:
+                extract = extract.upper()
+                self._split_chain_save_struct(obj=s[0][extract],sid=self.sid)
+            else:
+                raise ValueError
+
+    def read_struct(self):
+        """read_struct read pdb structure
+
+        https://biopython-cn.readthedocs.io/zh_CN/latest/cn/chr11.html#id3
+        一个 Structure 对象的整体布局遵循称为SMCRA（Structure/Model/Chain/Residue/Atom，结构/模型/链/残基/原子）的体系架构：
+
+        结构由模型组成
+        模型由多条链组成
+        链由残基组成
+        多个原子构成残基
+        Returns:
+            [obj] -- [Model]
+        """
+        p = PDBParser(PERMISSIVE=1)
+        s = p.get_structure(self.sid, self.path)
+        return s
+
+    def get_chain(self,chain_str):
+        s = self.read_struct()
+        return s[0][chain_str]
+
+    def init_path(self):
+        _p = return_path(self.path)
+        self.path = _p
+        if os.path.isfile(_p) == False:
+            raise FileNotFoundError(f'file {_p} not found')
+
+    def read_formula(self):
+        """read_formula read pdb file molecule information and identifier in this pdb file(just like residue)
+        
+        [extended_summary]
+        
+        :raises OSError: Do not find this file
+        """
+        _path = self.path
+        try:
+            with open(_path,'r') as f:
+                text_line = f.readlines()
+                _l = []
+                for i in text_line:
+                    li = i.split()
+                    if li[0] == 'FORMUL':
+                        _l.append(li)
+                        self.mole_id.append(li[2])
+                for ii in _l:
+                    self.mole_struct.append(''.join(ii[3:]))
+        except Exception as e:
+            raise e
+
+        def clean_pdb(self,outfile=None,path=None):
+            __lst = Bdp.read_line_list('ATOM',self.path)
+            _p = Path(path) if path else Path(self.path).absolute().parent
+            _f = outfile if outfile else f"{self.sid}.clean.pdb"
+            write_path = _p.joinpath(_f)
+            if write_path.exists(): write_path.unlink() 
+            with open(write_path,'w+') as f:
+                for i in __lst:
+                    f.write(f"{i}\n")
+
+def renamepdbfile(path):
+    """renamepdbfile 自动对path路径中批量下载的pdb中文件的文件名进行重命名
+
+    [extended_summary]
+
+    Arguments:
+        path {[type]} -- [description]
+    """
+    _flist = get_path_contents(path)[0]
+    for i in _flist:
+        i = i.strip()
+        basetuple = os.path.split(i)
+        basedir = basetuple[0]
+        _fn = basetuple[1]
+        res = re.match(pattern='pdb([A-Za-z0-9]*).ent',string=_fn)
+        # nfn = res.group(1)
+        # newname =os.path.join(basedir,str(nfn)+'.pbd')
+        # os.rename(src = i,dst = newname)
+        try:
+            nfn = res.group(1)
+            newname =os.path.join(basedir,str(nfn)+'.pdb')
+            os.rename(src = i,dst = newname)
+        except IndexError:
+            raise IndexError(f'Could not find pdbid in this filename: {i}')
+        except AttributeError:
+            continue
+        except FileNotFoundError as e:
+            if os.path.exists(i):
+                raise e
+            elif os.path.exists(newname):
+                continue
+            else:
+                raise e
+
+
+
+def pdbupdate(path):
+    pl = PDBList(pdb=path)
+    pl.update_pdb()
+
+