pymol/script/pymol_plugin.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file        :auto.py
@Description:       :自动处理脚本
@Date     :2021/09/02 15:19:15
@Author      :hotwa
@version      :1.0
https://zhuanlan.zhihu.com/p/121215784 # PyMOL 选择器的语法参考
https://blog.csdn.net/u011450367/article/details/51815130
resource: https://www.cnblogs.com/wq242424/p/13073222.html
http://blog.sina.com.cn/s/blog_7188922f0100wbz1.html
https://www.jianshu.com/p/3396e94315cb
https://blog.csdn.net/dengximo9047/article/details/101221495 # 疏水表面
http://www.mdbbs.org/thread-4064-1-1.html # 用来寻找配体口袋的残基
'''

from functools import partial
from pymol import cmd
from pathlib import Path
import pandas as pd
from json import dumps
import numpy as np
from biotools import Bdp
from loguru import logger
# from types import FunctionType,CodeType

logger.add('pymol_plugin_{time}.log')

# select ligand, resn x
# cmd.select('ligand','byres 5UEH within 6 of resn GOL resn 85P')
def autoshow(i,path,distance = 6,ionshow = False):
    """autoshow 自动展示所有配体6A以内的lines，方便查找共价键

    [extended_summary] # cmd.create('pocket',f'byres {i} within {distance} of {rawstring}') # 创建一个在当前pdb对象中配体残基周围距离为6A的口袋对象

    Arguments:
        i {[string]} -- [pdbid 例如 5EA9]

    Keyword Arguments:
        path {[string]} -- [pdb文件存放的目录，目前支持后缀为.pdb的文件，也可以在全局变量中设置好文件目录] (default: {path})
        distance {[int]} -- [显示周围原子的距离参数] (default: {6})
        ionshow {[bool]} -- [离子和有机小分子周围共价结合观察使用，默认不展示] (default: {False})

    Returns:
        [list] -- [返回ligid标识符，除去了部分离子]
    """
    cmd.reinitialize()
    p = Path(path)
    file = p.joinpath(f"{i}.pdb")
    cmd.load(file,i)
    cmd.remove('solvent metals') # 移除金属离子和溶剂
    mole = moleculeidentity(f"{i}",path)
    rawstring = 'resn ' + ' resn '.join(mole.ligIdNoion)
    print(f'{rawstring} around {distance}')
    cmd.select('ligand',f'{rawstring}')
    cmd.select('ligand_around',f'{rawstring} around {distance}') # 选择一个在当前pdb对象中配体残基周围距离为6A的口袋对象
    if ionshow: # 是否显示所有记录小分子HET条记录中的信息，对于离子与有机物显示相关共价键有效
        cmd.show('lines','ligand_part') # 显示所有HET侧链
        cmd.create('ligand_part',f'{rawstring} expand {distance}') # 单独显示小分子扩展6A周围的lines对象
    cmd.create('organ',f'organic expand {distance}')
    cmd.show('lines','organ')
    return mole.ligId

def bejson(d:dict(help='need to beauty json')) -> dict:
    return dumps(d,indent=4,ensure_ascii=False)

class covalentidentity():
    """covalentidentity 使用pymol进行识别共价键，在输出有机小分子6A以内的原子时可能出现分子的构象发生改变导致共价键距离计算有问题，还可能是与金属离子形成共价键
    """
    # __slots__ = []

    def __init__(self,pdbfilename,pdbid,path):
        """__init__ [summary]

        [extended_summary]

        Arguments:
            pdbfilename {[string]} -- pdb文件名称
            pdbid {[string]} -- 4 length id
            path {[string or pathlib obj]} -- pdb文件路径
        """
        self.pdbfilename = pdbfilename
        self.pdbid = pdbid
        self.path = Path(path)
        self.init()

    @staticmethod
    def cleanpdb(before_string):
        """
        处理传入参数pdb 例如: pdb1b12.ent 1b12.pdb 转化成 1b12
        """
        # print(f'try to format {before_string}')
        if '.ent' in before_string:
            before_string = before_string[3:-4].lower()
        elif '.pdb' in before_string:
            before_string = before_string[:4].lower()
        elif len(before_string) == 4:
            before_string = before_string.lower()
        else:
            if len(before_string) != 4:
                raise ValueError(f'length out of 4 {before_string}')
        return before_string

    @classmethod
    def export_organ(cls,pdbfile,path,distance = 6,remove_water_metals = True):
        # 输出小分子结合部分的共价结合信息对象，pdb格式
        file = path.joinpath(pdbfile)
        _pdbid = covalentidentity.cleanpdb(pdbfile)
        savepath = path.joinpath(f'{_pdbid}_organ_around_{distance}_lines.pdb')
        cmd.reinitialize()
        cmd.load(filename = file,object = pdbfile)
        if remove_water_metals: cmd.remove('solvent metals') # 移除金属离子和溶剂
        cmd.create('organ',f'organic expand {distance}')
        cmd.show('lines','organ')
        cmd.save(filename=savepath,selection='organ')
        cls.connect_info_pdb_path = savepath
        return savepath

    def init(self):
        self.__create_dataframe()

    def __read_organ_pdb(self):
        exportfile = covalentidentity.export_organ(self.pdbfilename,path = Path(self.path))
        with open(file=exportfile,mode = 'r') as f:
            read_list = [i.replace('\n', '') for i in f.readlines()]
            self.read_connect = [i for i in read_list if i[:6].strip()=='CONECT']
            self.read_hetatm = [i for i in read_list if i[:6].strip()=='HETATM']
            self.read_atom = [i for i in read_list if i[:6].strip()=='ATOM']

    def __create_dataframe(self):
        self.__read_organ_pdb() # 读取初始化数据
        connect_infos = []
        for i in self.read_connect: #清洗数据
            if len(i.split()) == 3: # 策略：查找只有两个原子键的连接信息
                connect_infos.append(i)
        self.df = self.transformtodataframe(self.read_hetatm).append(self.transformtodataframe(self.read_atom))
        target_search_connect = []
        for i in connect_infos:
            l = [i for i in i.split() if i != 'CONECT']
            target_search_connect.append(l)
        self.target_search_connect = target_search_connect # 两个连接信息的列表，不一定都是共价键

    def __search_convenlent(self,distance_accuracy=1):
        true_covlent = [] # 共价键连接信息记录 有几个列表就有几个共价键链接信息
        infos = []
        locateinfos = {}
        for item in self.target_search_connect: # 对两个连接原子进行判断
            df = self.df
            # molecule chain search
            res0 = df[df['AtomNum']==item[0]]
            res1 = df[df['AtomNum']==item[1]]
            if (res0['Identifier'].all() == res1['Identifier'].all()):
                continue
            else:
                true_covlent.append(item)
                # 定位信息那个是小分子的行信息和那个是蛋白行的信息
                if res0['Identifier'].all() == 'ATOM': locateinfos['ATOM'] = res0
                if res0['Identifier'].all() == 'HETATM': locateinfos['HETATM'] = res0
                if res1['Identifier'].all() == 'ATOM': locateinfos['ATOM'] = res1
                if res1['Identifier'].all() == 'HETATM': locateinfos['HETATM'] = res1
                assert (not res0.empty and not res1.empty),(
                    'this code occur a bug'
                    'try to connet to author fix it'
                    'maybe is pymol output connect infos error'
                    'never be happen'
                )
            partinfos = self.__fill_infos(i=item,pro=locateinfos['ATOM'],mole=locateinfos['HETATM'],distance_accuracy=distance_accuracy)
            infos.append(partinfos)
        return infos

    def convenlent_infos(self,distance_accuracy=1):
        return self.__search_convenlent(distance_accuracy=distance_accuracy)

    @property
    def cov_infos(self):
        d = self.__search_convenlent()
        return d

    @staticmethod
    def __fill_infos(i,pro,mole,distance_accuracy=1):
        distance = np.sqrt(np.square(float(pro['X'].all())-float(mole['X'].all())) + np.square(float(pro['Y'].all())-float(mole['Y'].all())) + np.square(float(pro['Z'].all())-float(mole['Z'].all())))
        infostmplate = {
            'ConnectInfos': i,
            'ResidueName':pro['ResidueName'].all(),
            'ResidueSequence':pro['ResidueSequence'].all(),
            'ChainIndentifier':pro['ChainIndentifier'].all(),
            'CovenlentAtom':{
                'protein': {
                    'AtomName':pro['AtomName'].all(),
                    'ElementSymbol':pro['ElementSymbol'].all(),
                },
                'molecule':{
                    'AtomName':mole['AtomName'].all(),
                    'ElementSymbol':mole['ElementSymbol'].all(),
                },
            },
            'CovenlentDistance':round(distance,distance_accuracy),
            'LigandName':mole['ResidueName'].all(),
        }
        return infostmplate

    @staticmethod
    def transformtodataframe(readlinecontent=None,first_label = 'ATOM',path = False):
        path_info = (Path(path).name.__str__(),Path(path).parent.__str__()) if path else ('unknown file name','unknown path')
        if path: readlinecontent = Bdp.read_line_list(first_column=first_label,path=path)
        if readlinecontent == None: raise ValueError('readlinecontent need')
        if first_label == ('ATOM' or 'HETATM'): # ! 注意python中的懒惰运算，及运算符特性
            # 将pdb每行读取的信息转化为dataframe 针对ATOM HETAM开头的行适用
            Identifier,AtomNum,AtomName,AtomLocationIndicator,ResidueName,ChainIndentifier,ResidueSequence,InsertionsResidue,X,Y,Z,Occupancy,Tfactor,SegmentIdentifier,ElementSymbol = [],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
            for i in readlinecontent:
                Identifier.append(i[:7].strip())
                AtomNum.append(i[6:11].strip())
                AtomName.append(i[12:16].strip())
                AtomLocationIndicator.append(i[16].strip())
                ResidueName.append(i[17:20].strip())
                ChainIndentifier.append(i[21].strip())
                ResidueSequence.append(i[22:26].strip())
                InsertionsResidue.append(i[26].strip())
                X.append(i[30:38].strip())
                Y.append(i[38:46].strip())
                Z.append(i[46:54].strip())
                Occupancy.append(i[54:60].strip())
                Tfactor.append(i[60:66].strip())
                SegmentIdentifier.append(i[72:76].strip())
                ElementSymbol.append(i[76:78].strip())
            df = pd.DataFrame(data={
                'Identifier':Identifier,
                'AtomNum':AtomNum,
                'AtomName':AtomName,
                'AtomLocationIndicator':AtomLocationIndicator,
                'ResidueName':ResidueName,
                'ChainIndentifier':ChainIndentifier,
                'ResidueSequence':ResidueSequence,
                'InsertionsResidue':InsertionsResidue,
                'X':X,
                'Y':Y,
                'Z':Z,
                'Occupancy':Occupancy,
                'Tfactor':Tfactor,
                'SegmentIdentifier':SegmentIdentifier,
                'ElementSymbol':ElementSymbol
            })
        elif first_label == 'LINK':
            # https://www.wwpdb.org/documentation/file-format-content/format33/sect6.html
            Record_name,Atom_name1,Alternate_location_indicator1,Residue_name1,Chain_identifier1,Residue_sequence_number1,Insertion_code1,Atom_name2,Alternate_location_indicator2,Residue_name2,Chain_identifier2,Residue_sequence_number2,Insertion_code2,Symmetry_operator_atom_1,Symmetry_operator_atom_2,Link_distance = [],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
            for i in readlinecontent:
                if len(i) != 78:
                    logger.exception(
                    f"[LINK label length error] not match the LINK line length\n"
                    f"[input string]:{i}\n"
                    f"check your input file:{path_info[0]} from {path_info[1]}\n"
                    "if this line not have link distance, try to caculate by covalent.bypymol method")
                    pass
                else:
                    Record_name.append(i[:6].strip())
                    Atom_name1.append(i[12:16].strip())
                    Alternate_location_indicator1.append(i[16].strip())
                    Residue_name1.append(i[17:20].strip())
                    Chain_identifier1.append(i[21].strip())
                    Residue_sequence_number1.append(i[22:26].strip())
                    Insertion_code1.append(i[26].strip())
                    Atom_name2.append(i[42:46].strip())
                    Alternate_location_indicator2.append(i[46].strip())
                    Residue_name2.append(i[47:50].strip())
                    Chain_identifier2.append(i[51].strip())
                    Residue_sequence_number2.append(i[52:56].strip())
                    Insertion_code2.append(i[56].strip())
                    Symmetry_operator_atom_1.append(i[59:65].strip())
                    Symmetry_operator_atom_2.append(i[66:72].strip())
                    Link_distance.append(i[73:78].strip())
            df = pd.DataFrame(data={
                'Record_name':Record_name,
                'Atom_name1':Atom_name1,
                'Alternate_location_indicator1':Alternate_location_indicator1,
                'Residue_name1':pd.Series(Residue_name1,dtype='object'),
                'Chain_identifier1':Chain_identifier1,
                'Residue_sequence_number1':Residue_sequence_number1,
                'Insertion_code1':Insertion_code1,
                'Atom_name2':Atom_name2,
                'Alternate_location_indicator2':Alternate_location_indicator2,
                'Residue_name2':pd.Series(Residue_name2,dtype='object'),
                'Chain_identifier2':Chain_identifier2,
                'Residue_sequence_number2':Residue_sequence_number2,
                'Insertion_code2':Insertion_code2,
                'Symmetry_operator_atom_1':Symmetry_operator_atom_1,
                'Symmetry_operator_atom_2':Symmetry_operator_atom_2,
                'Link_distance':pd.Series(Link_distance,dtype='float32'),
            })
        else:
            assert False,(
                'No return',
                'a error occurred'
            )
        return df

class color_plan():
    color1 = (('color1', '[186,182,217]'), 'purple')
    color2 = (('color2', '[233,195,153]'), 'yellow')
    color3 = (('color3', '[141,215,247]'), 'blue')
    color4 = (('color4', '[206,155,198]'), 'purple')
    color5 = (('color5', '[251,187,62]'), 'orange')
    color6 = (('color6', '[245,157,158]'), 'red')
    color7 = (('color7', '[133,188,135]'), 'green')
    colors = (color1, color2, color3, color4, color5, color6, color7)

    @staticmethod
    def defcolor():
        color_gennerate = map(lambda x:x[0],color_plan.colors)
        list(map(lambda x:cmd.set_color(*x),color_gennerate))

# error class
class PathError(BaseException):
    def __init__(self,arg):
        self.arg = arg

class moleculeidentity():
    """moleculeidentity [summary]

    [识别pdb蛋白文件中的小分子标识符]
    """

    def __init__(self,pdbfile,path):
        self.pathstr = path
        self.path = Path(path)
        self.pdbfile = pdbfile
        self._init()

    def _init(self):
        if not self.path.exists():
            raise PathError('path not exist')
        if not isinstance(self.pdbfile,str):
            raise TypeError('Pdbid must be a string!')
        if ('.pdb' not in self.pdbfile) and (len(self.pdbfile) != 4):
            raise TypeError('Pdbid must be 4 letters')
        if ('.pdb' or '.PDB') in self.pdbfile:
            raise TypeError(f'{self.pdbfile} Remove ".pdb" from input arg, add automatically')
        file_list =  list(self.path.glob('*.pdb'))
        self.path_parent = file_list[0].parent
        self.pdbfilelist = [i.name[:4].upper() for i in file_list]

    def __parse_pdb_ligid(self,ion=True):
        if self.pdbfile.upper() not in self.pdbfilelist:
            raise FileNotFoundError(f'not found {self.pdbfile} in {self.path}')
        infos_line = []
        ligId = []
        for i in self.__generate_pdb_lines():
            if self.check_line_header(i) == 'HET':
                infos_line.append(i)
        ligId = [i.split()[1] for i in infos_line]
        ligId = list(set(ligId))
        if not ion: ligId = [i for i in ligId if len(i) == 3 ] # remove ion from list
        return ligId

    @property
    def ligId(self):
        # return ligId include ion
        return self.__parse_pdb_ligid()

    @property
    def ligIdNoion(self):
        return self.__parse_pdb_ligid(ion=False)

    @staticmethod
    def check_line_header(line_text):
        return line_text[0:6].strip()

    def __generate_pdb_lines(self):
        openpdbfile = self.pdbfile + '.pdb' if '.pdb' not in self.pdbfile else self.pdbfile
        for row in open(self.path_parent.joinpath(openpdbfile),'r+'):
            yield row.strip()

class covalent(object):

    def __init__(self,pdbfilename,path,pdbid):
        self.pdbfilename = pdbfilename
        self.pdbid = covalentidentity.cleanpdb(pdbid)
        self.path_parent = Path(path)
        self.path = self.path_parent.joinpath(pdbfilename)
        self._init()

    def _init(self):
        self.link_df = covalentidentity.transformtodataframe(path = self.path,first_label='LINK')

    @classmethod
    def bypdb(cls,pdbfilename,path,pdbid):
        return cls(pdbfilename=pdbfilename,path=path,pdbid=pdbid)

    @staticmethod
    def bypymol(pdbfilename,pdbid,path):
        return covalentidentity(pdbfilename,pdbid,path)

    @property
    def mole_id(self)->list:
        _instance=Bdp(path=self.path,sid=self.pdbid)
        return [i for i in _instance.mole_id if len(i)==3 and i != 'HOH' and i != 'SO4' and i != 'PO4'] # ! 在这里手动剔除掉硫酸根和磷酸根

    def modresdata(self)->list:
        """modresdata 标准残疾的修饰信息
        """
        modres_lst = Bdp.read_line_list(first_column='MODRES',path=self.path)
        lst = [i[12:15] for i in modres_lst] # 切片出修饰残基的小分子
        return list(set(lst))

    def get_covalent(self):
        # 从原始的pdb文件中可获取共价键信息 # ? 对其中的信息正确率为100% 来自原始的pdb文件中的信息
        res1 = self.link_df.query(f'Residue_name1 in {self.remove_modres_moleid}')
        res2 = self.link_df.query(f'Residue_name2 in {self.remove_modres_moleid}')
        df = pd.merge(res1, res2,how='outer')
        df.astype('object')
        # remove ions link with ligand
        df = df[df['Residue_name1'].str.len()==3]
        df = df[df['Residue_name2'].str.len()==3]
        return df


    @property
    def remove_modres_moleid(self)->list:
        # 清除link信息中的modres，即修饰残基的ligid
        # lst = Bdp.read_line_list(first_column='LINK',path=self.path)
        # lst = list(set(map(lambda s:s[17:20],lst)))
        clean_func = partial(self.func_dynamic_data_template,dynamic_data=self.ModifiedResidues)
        res = clean_func(origin_data=self.mole_id)
        return list(filter(lambda x:len(x.strip())==3 and x != 'HOH',res))

    @property
    def ModifiedResidues(self):
        ModifiedResiduesId = []
        for i in self.mole_id:
            if i in self.modresdata():
                ModifiedResiduesId.append(i)
        # 从SEQRES序列中获取MODRES的修饰残基的小分子，仅仅从SEQRES有时获取不全面
        seqres_lst = [i[19:].strip() for i in Bdp.read_line_list(first_column='SEQRES',path = self.path)]
        seqres_lst_string = '\r\n'.join(seqres_lst)
        seqres_lst_set = set(list(i for line in seqres_lst_string.splitlines() for i in line.split()))
        seqres_list = seqres_lst_set.intersection(set(self.mole_id))
        ModifiedResiduesId.extend(seqres_list)
        return list(set(ModifiedResiduesId))

    @staticmethod
    def func_dynamic_data_template(origin_data:list,dynamic_data:list)->list:
        # 取origin_data和dynamic_data补集
        origin_data_set = set(origin_data)
        dynamic_data_set = set(dynamic_data)
        _data = origin_data_set.difference(dynamic_data_set)
        return _data


cmd.extend('autoshow', autoshow) # pymol中自定义autoshow函数，请使用cmd.autoshow()命令

#! 下面为具体使用查找共价键的代码
# def read_link_lines(item):
#     ins1 = covalent.bypdb(pdbfilename = item,path='M:\program\\autotask\search_res1_pdb',pdbid = item[3:7])
#     res = ins1.get_covalent()
#     res['Pdb_id'] = [item[3:7] for _ in range(len(res))]
#     return res

def return_mole(series):
    # LINK 字段信息进行分析解读
    residue = [i.upper() for i in 'Gly,Ala,Val,Leu,Ile,Pro,Phe,Tyr,Trp,Ser,Thr,Cys,Met,Asn,Gln,Asp,Glu,Lys,Arg,His'.split(',')]
    if str(series[1]['Residue_name1']) in residue:
        if str(series[1]['Residue_name2']) in residue:
            # 除去两个氨基酸连接的情况
            return pd.DataFrame()
        else:
            _data =  series[1]['Residue_name2'],series[1]['Link_distance'],series[1]['Pdb_id'],series[1]['Chain_identifier2']
            _df = pd.DataFrame(data={
            'LigandName': pd.Series([_data[0]],dtype='string'),
            'Link_distance': pd.Series([_data[1]],dtype='float32'),
            'Pdb_id': pd.Series([_data[2]],dtype='string'),
            'Chain_identifier': pd.Series([_data[3]],dtype='string')
        })
            return _df
    elif str(series[1]['Residue_name2']) in residue:
        if str(series[1]['Residue_name1']) in residue:
            # 除去两个氨基酸连接的情况
            return pd.DataFrame()
        else:
            _data =  series[1]['Residue_name1'],series[1]['Link_distance'],series[1]['Pdb_id'],series[1]['Chain_identifier1']
            _df = pd.DataFrame(data={
            'LigandName': pd.Series([_data[0]],dtype='string'),
            'Link_distance': pd.Series([_data[1]],dtype='float32'),
            'Pdb_id': pd.Series([_data[2]],dtype='string'),
            'Chain_identifier': pd.Series([_data[3]],dtype='string')
        })
            return _df
    else:
        logger.exception(
            f'[MoleculeMatchError] Pdbid {series[1]["Pdb_id"]} occur a error\n'
            f'出现了两个残基名称都不属于常见的20中\n{residue}\n'
            f'source: {series[1]}'
        )
        return pd.DataFrame() # 两个小分子连接的情况，去除

if __name__ == '__main__':
    ...