Files
pymol/script/pymol_plugin.py
2024-09-07 09:12:14 +08:00

498 lines
22 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :auto.py
@Description: :自动处理脚本
@Date :2021/09/02 15:19:15
@Author :hotwa
@version :1.0
https://zhuanlan.zhihu.com/p/121215784 # PyMOL 选择器的语法参考
https://blog.csdn.net/u011450367/article/details/51815130
resource: https://www.cnblogs.com/wq242424/p/13073222.html
http://blog.sina.com.cn/s/blog_7188922f0100wbz1.html
https://www.jianshu.com/p/3396e94315cb
https://blog.csdn.net/dengximo9047/article/details/101221495 # 疏水表面
http://www.mdbbs.org/thread-4064-1-1.html # 用来寻找配体口袋的残基
'''
from functools import partial
from pymol import cmd
from pathlib import Path
import pandas as pd
from json import dumps
import numpy as np
from biotools import Bdp
from loguru import logger
# from types import FunctionType,CodeType
logger.add('pymol_plugin_{time}.log')
# select ligand, resn x
# cmd.select('ligand','byres 5UEH within 6 of resn GOL resn 85P')
def autoshow(i,path,distance = 6,ionshow = False):
"""autoshow 自动展示所有配体6A以内的lines方便查找共价键
[extended_summary] # cmd.create('pocket',f'byres {i} within {distance} of {rawstring}') # 创建一个在当前pdb对象中配体残基周围距离为6A的口袋对象
Arguments:
i {[string]} -- [pdbid 例如 5EA9]
Keyword Arguments:
path {[string]} -- [pdb文件存放的目录目前支持后缀为.pdb的文件也可以在全局变量中设置好文件目录] (default: {path})
distance {[int]} -- [显示周围原子的距离参数] (default: {6})
ionshow {[bool]} -- [离子和有机小分子周围共价结合观察使用,默认不展示] (default: {False})
Returns:
[list] -- [返回ligid标识符除去了部分离子]
"""
cmd.reinitialize()
p = Path(path)
file = p.joinpath(f"{i}.pdb")
cmd.load(file,i)
cmd.remove('solvent metals') # 移除金属离子和溶剂
mole = moleculeidentity(f"{i}",path)
rawstring = 'resn ' + ' resn '.join(mole.ligIdNoion)
print(f'{rawstring} around {distance}')
cmd.select('ligand',f'{rawstring}')
cmd.select('ligand_around',f'{rawstring} around {distance}') # 选择一个在当前pdb对象中配体残基周围距离为6A的口袋对象
if ionshow: # 是否显示所有记录小分子HET条记录中的信息对于离子与有机物显示相关共价键有效
cmd.show('lines','ligand_part') # 显示所有HET侧链
cmd.create('ligand_part',f'{rawstring} expand {distance}') # 单独显示小分子扩展6A周围的lines对象
cmd.create('organ',f'organic expand {distance}')
cmd.show('lines','organ')
return mole.ligId
def bejson(d:dict(help='need to beauty json')) -> dict:
return dumps(d,indent=4,ensure_ascii=False)
class covalentidentity():
"""covalentidentity 使用pymol进行识别共价键在输出有机小分子6A以内的原子时可能出现分子的构象发生改变导致共价键距离计算有问题还可能是与金属离子形成共价键
"""
# __slots__ = []
def __init__(self,pdbfilename,pdbid,path):
"""__init__ [summary]
[extended_summary]
Arguments:
pdbfilename {[string]} -- pdb文件名称
pdbid {[string]} -- 4 length id
path {[string or pathlib obj]} -- pdb文件路径
"""
self.pdbfilename = pdbfilename
self.pdbid = pdbid
self.path = Path(path)
self.init()
@staticmethod
def cleanpdb(before_string):
"""
处理传入参数pdb 例如: pdb1b12.ent 1b12.pdb 转化成 1b12
"""
# print(f'try to format {before_string}')
if '.ent' in before_string:
before_string = before_string[3:-4].lower()
elif '.pdb' in before_string:
before_string = before_string[:4].lower()
elif len(before_string) == 4:
before_string = before_string.lower()
else:
if len(before_string) != 4:
raise ValueError(f'length out of 4 {before_string}')
return before_string
@classmethod
def export_organ(cls,pdbfile,path,distance = 6,remove_water_metals = True):
# 输出小分子结合部分的共价结合信息对象pdb格式
file = path.joinpath(pdbfile)
_pdbid = covalentidentity.cleanpdb(pdbfile)
savepath = path.joinpath(f'{_pdbid}_organ_around_{distance}_lines.pdb')
cmd.reinitialize()
cmd.load(filename = file,object = pdbfile)
if remove_water_metals: cmd.remove('solvent metals') # 移除金属离子和溶剂
cmd.create('organ',f'organic expand {distance}')
cmd.show('lines','organ')
cmd.save(filename=savepath,selection='organ')
cls.connect_info_pdb_path = savepath
return savepath
def init(self):
self.__create_dataframe()
def __read_organ_pdb(self):
exportfile = covalentidentity.export_organ(self.pdbfilename,path = Path(self.path))
with open(file=exportfile,mode = 'r') as f:
read_list = [i.replace('\n', '') for i in f.readlines()]
self.read_connect = [i for i in read_list if i[:6].strip()=='CONECT']
self.read_hetatm = [i for i in read_list if i[:6].strip()=='HETATM']
self.read_atom = [i for i in read_list if i[:6].strip()=='ATOM']
def __create_dataframe(self):
self.__read_organ_pdb() # 读取初始化数据
connect_infos = []
for i in self.read_connect: #清洗数据
if len(i.split()) == 3: # 策略:查找只有两个原子键的连接信息
connect_infos.append(i)
self.df = self.transformtodataframe(self.read_hetatm).append(self.transformtodataframe(self.read_atom))
target_search_connect = []
for i in connect_infos:
l = [i for i in i.split() if i != 'CONECT']
target_search_connect.append(l)
self.target_search_connect = target_search_connect # 两个连接信息的列表,不一定都是共价键
def __search_convenlent(self,distance_accuracy=1):
true_covlent = [] # 共价键连接信息记录 有几个列表就有几个共价键链接信息
infos = []
locateinfos = {}
for item in self.target_search_connect: # 对两个连接原子进行判断
df = self.df
# molecule chain search
res0 = df[df['AtomNum']==item[0]]
res1 = df[df['AtomNum']==item[1]]
if (res0['Identifier'].all() == res1['Identifier'].all()):
continue
else:
true_covlent.append(item)
# 定位信息那个是小分子的行信息和那个是蛋白行的信息
if res0['Identifier'].all() == 'ATOM': locateinfos['ATOM'] = res0
if res0['Identifier'].all() == 'HETATM': locateinfos['HETATM'] = res0
if res1['Identifier'].all() == 'ATOM': locateinfos['ATOM'] = res1
if res1['Identifier'].all() == 'HETATM': locateinfos['HETATM'] = res1
assert (not res0.empty and not res1.empty),(
'this code occur a bug'
'try to connet to author fix it'
'maybe is pymol output connect infos error'
'never be happen'
)
partinfos = self.__fill_infos(i=item,pro=locateinfos['ATOM'],mole=locateinfos['HETATM'],distance_accuracy=distance_accuracy)
infos.append(partinfos)
return infos
def convenlent_infos(self,distance_accuracy=1):
return self.__search_convenlent(distance_accuracy=distance_accuracy)
@property
def cov_infos(self):
d = self.__search_convenlent()
return d
@staticmethod
def __fill_infos(i,pro,mole,distance_accuracy=1):
distance = np.sqrt(np.square(float(pro['X'].all())-float(mole['X'].all())) + np.square(float(pro['Y'].all())-float(mole['Y'].all())) + np.square(float(pro['Z'].all())-float(mole['Z'].all())))
infostmplate = {
'ConnectInfos': i,
'ResidueName':pro['ResidueName'].all(),
'ResidueSequence':pro['ResidueSequence'].all(),
'ChainIndentifier':pro['ChainIndentifier'].all(),
'CovenlentAtom':{
'protein': {
'AtomName':pro['AtomName'].all(),
'ElementSymbol':pro['ElementSymbol'].all(),
},
'molecule':{
'AtomName':mole['AtomName'].all(),
'ElementSymbol':mole['ElementSymbol'].all(),
},
},
'CovenlentDistance':round(distance,distance_accuracy),
'LigandName':mole['ResidueName'].all(),
}
return infostmplate
@staticmethod
def transformtodataframe(readlinecontent=None,first_label = 'ATOM',path = False):
path_info = (Path(path).name.__str__(),Path(path).parent.__str__()) if path else ('unknown file name','unknown path')
if path: readlinecontent = Bdp.read_line_list(first_column=first_label,path=path)
if readlinecontent == None: raise ValueError('readlinecontent need')
if first_label == ('ATOM' or 'HETATM'): # ! 注意python中的懒惰运算及运算符特性
# 将pdb每行读取的信息转化为dataframe 针对ATOM HETAM开头的行适用
Identifier,AtomNum,AtomName,AtomLocationIndicator,ResidueName,ChainIndentifier,ResidueSequence,InsertionsResidue,X,Y,Z,Occupancy,Tfactor,SegmentIdentifier,ElementSymbol = [],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
for i in readlinecontent:
Identifier.append(i[:7].strip())
AtomNum.append(i[6:11].strip())
AtomName.append(i[12:16].strip())
AtomLocationIndicator.append(i[16].strip())
ResidueName.append(i[17:20].strip())
ChainIndentifier.append(i[21].strip())
ResidueSequence.append(i[22:26].strip())
InsertionsResidue.append(i[26].strip())
X.append(i[30:38].strip())
Y.append(i[38:46].strip())
Z.append(i[46:54].strip())
Occupancy.append(i[54:60].strip())
Tfactor.append(i[60:66].strip())
SegmentIdentifier.append(i[72:76].strip())
ElementSymbol.append(i[76:78].strip())
df = pd.DataFrame(data={
'Identifier':Identifier,
'AtomNum':AtomNum,
'AtomName':AtomName,
'AtomLocationIndicator':AtomLocationIndicator,
'ResidueName':ResidueName,
'ChainIndentifier':ChainIndentifier,
'ResidueSequence':ResidueSequence,
'InsertionsResidue':InsertionsResidue,
'X':X,
'Y':Y,
'Z':Z,
'Occupancy':Occupancy,
'Tfactor':Tfactor,
'SegmentIdentifier':SegmentIdentifier,
'ElementSymbol':ElementSymbol
})
elif first_label == 'LINK':
# https://www.wwpdb.org/documentation/file-format-content/format33/sect6.html
Record_name,Atom_name1,Alternate_location_indicator1,Residue_name1,Chain_identifier1,Residue_sequence_number1,Insertion_code1,Atom_name2,Alternate_location_indicator2,Residue_name2,Chain_identifier2,Residue_sequence_number2,Insertion_code2,Symmetry_operator_atom_1,Symmetry_operator_atom_2,Link_distance = [],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]
for i in readlinecontent:
if len(i) != 78:
logger.exception(
f"[LINK label length error] not match the LINK line length\n"
f"[input string]:{i}\n"
f"check your input file:{path_info[0]} from {path_info[1]}\n"
"if this line not have link distance, try to caculate by covalent.bypymol method")
pass
else:
Record_name.append(i[:6].strip())
Atom_name1.append(i[12:16].strip())
Alternate_location_indicator1.append(i[16].strip())
Residue_name1.append(i[17:20].strip())
Chain_identifier1.append(i[21].strip())
Residue_sequence_number1.append(i[22:26].strip())
Insertion_code1.append(i[26].strip())
Atom_name2.append(i[42:46].strip())
Alternate_location_indicator2.append(i[46].strip())
Residue_name2.append(i[47:50].strip())
Chain_identifier2.append(i[51].strip())
Residue_sequence_number2.append(i[52:56].strip())
Insertion_code2.append(i[56].strip())
Symmetry_operator_atom_1.append(i[59:65].strip())
Symmetry_operator_atom_2.append(i[66:72].strip())
Link_distance.append(i[73:78].strip())
df = pd.DataFrame(data={
'Record_name':Record_name,
'Atom_name1':Atom_name1,
'Alternate_location_indicator1':Alternate_location_indicator1,
'Residue_name1':pd.Series(Residue_name1,dtype='object'),
'Chain_identifier1':Chain_identifier1,
'Residue_sequence_number1':Residue_sequence_number1,
'Insertion_code1':Insertion_code1,
'Atom_name2':Atom_name2,
'Alternate_location_indicator2':Alternate_location_indicator2,
'Residue_name2':pd.Series(Residue_name2,dtype='object'),
'Chain_identifier2':Chain_identifier2,
'Residue_sequence_number2':Residue_sequence_number2,
'Insertion_code2':Insertion_code2,
'Symmetry_operator_atom_1':Symmetry_operator_atom_1,
'Symmetry_operator_atom_2':Symmetry_operator_atom_2,
'Link_distance':pd.Series(Link_distance,dtype='float32'),
})
else:
assert False,(
'No return',
'a error occurred'
)
return df
class color_plan():
color1 = (('color1', '[186,182,217]'), 'purple')
color2 = (('color2', '[233,195,153]'), 'yellow')
color3 = (('color3', '[141,215,247]'), 'blue')
color4 = (('color4', '[206,155,198]'), 'purple')
color5 = (('color5', '[251,187,62]'), 'orange')
color6 = (('color6', '[245,157,158]'), 'red')
color7 = (('color7', '[133,188,135]'), 'green')
colors = (color1, color2, color3, color4, color5, color6, color7)
@staticmethod
def defcolor():
color_gennerate = map(lambda x:x[0],color_plan.colors)
list(map(lambda x:cmd.set_color(*x),color_gennerate))
# error class
class PathError(BaseException):
def __init__(self,arg):
self.arg = arg
class moleculeidentity():
"""moleculeidentity [summary]
[识别pdb蛋白文件中的小分子标识符]
"""
def __init__(self,pdbfile,path):
self.pathstr = path
self.path = Path(path)
self.pdbfile = pdbfile
self._init()
def _init(self):
if not self.path.exists():
raise PathError('path not exist')
if not isinstance(self.pdbfile,str):
raise TypeError('Pdbid must be a string!')
if ('.pdb' not in self.pdbfile) and (len(self.pdbfile) != 4):
raise TypeError('Pdbid must be 4 letters')
if ('.pdb' or '.PDB') in self.pdbfile:
raise TypeError(f'{self.pdbfile} Remove ".pdb" from input arg, add automatically')
file_list = list(self.path.glob('*.pdb'))
self.path_parent = file_list[0].parent
self.pdbfilelist = [i.name[:4].upper() for i in file_list]
def __parse_pdb_ligid(self,ion=True):
if self.pdbfile.upper() not in self.pdbfilelist:
raise FileNotFoundError(f'not found {self.pdbfile} in {self.path}')
infos_line = []
ligId = []
for i in self.__generate_pdb_lines():
if self.check_line_header(i) == 'HET':
infos_line.append(i)
ligId = [i.split()[1] for i in infos_line]
ligId = list(set(ligId))
if not ion: ligId = [i for i in ligId if len(i) == 3 ] # remove ion from list
return ligId
@property
def ligId(self):
# return ligId include ion
return self.__parse_pdb_ligid()
@property
def ligIdNoion(self):
return self.__parse_pdb_ligid(ion=False)
@staticmethod
def check_line_header(line_text):
return line_text[0:6].strip()
def __generate_pdb_lines(self):
openpdbfile = self.pdbfile + '.pdb' if '.pdb' not in self.pdbfile else self.pdbfile
for row in open(self.path_parent.joinpath(openpdbfile),'r+'):
yield row.strip()
class covalent(object):
def __init__(self,pdbfilename,path,pdbid):
self.pdbfilename = pdbfilename
self.pdbid = covalentidentity.cleanpdb(pdbid)
self.path_parent = Path(path)
self.path = self.path_parent.joinpath(pdbfilename)
self._init()
def _init(self):
self.link_df = covalentidentity.transformtodataframe(path = self.path,first_label='LINK')
@classmethod
def bypdb(cls,pdbfilename,path,pdbid):
return cls(pdbfilename=pdbfilename,path=path,pdbid=pdbid)
@staticmethod
def bypymol(pdbfilename,pdbid,path):
return covalentidentity(pdbfilename,pdbid,path)
@property
def mole_id(self)->list:
_instance=Bdp(path=self.path,sid=self.pdbid)
return [i for i in _instance.mole_id if len(i)==3 and i != 'HOH' and i != 'SO4' and i != 'PO4'] # ! 在这里手动剔除掉硫酸根和磷酸根
def modresdata(self)->list:
"""modresdata 标准残疾的修饰信息
"""
modres_lst = Bdp.read_line_list(first_column='MODRES',path=self.path)
lst = [i[12:15] for i in modres_lst] # 切片出修饰残基的小分子
return list(set(lst))
def get_covalent(self):
# 从原始的pdb文件中可获取共价键信息 # ? 对其中的信息正确率为100% 来自原始的pdb文件中的信息
res1 = self.link_df.query(f'Residue_name1 in {self.remove_modres_moleid}')
res2 = self.link_df.query(f'Residue_name2 in {self.remove_modres_moleid}')
df = pd.merge(res1, res2,how='outer')
df.astype('object')
# remove ions link with ligand
df = df[df['Residue_name1'].str.len()==3]
df = df[df['Residue_name2'].str.len()==3]
return df
@property
def remove_modres_moleid(self)->list:
# 清除link信息中的modres即修饰残基的ligid
# lst = Bdp.read_line_list(first_column='LINK',path=self.path)
# lst = list(set(map(lambda s:s[17:20],lst)))
clean_func = partial(self.func_dynamic_data_template,dynamic_data=self.ModifiedResidues)
res = clean_func(origin_data=self.mole_id)
return list(filter(lambda x:len(x.strip())==3 and x != 'HOH',res))
@property
def ModifiedResidues(self):
ModifiedResiduesId = []
for i in self.mole_id:
if i in self.modresdata():
ModifiedResiduesId.append(i)
# 从SEQRES序列中获取MODRES的修饰残基的小分子仅仅从SEQRES有时获取不全面
seqres_lst = [i[19:].strip() for i in Bdp.read_line_list(first_column='SEQRES',path = self.path)]
seqres_lst_string = '\r\n'.join(seqres_lst)
seqres_lst_set = set(list(i for line in seqres_lst_string.splitlines() for i in line.split()))
seqres_list = seqres_lst_set.intersection(set(self.mole_id))
ModifiedResiduesId.extend(seqres_list)
return list(set(ModifiedResiduesId))
@staticmethod
def func_dynamic_data_template(origin_data:list,dynamic_data:list)->list:
# 取origin_data和dynamic_data补集
origin_data_set = set(origin_data)
dynamic_data_set = set(dynamic_data)
_data = origin_data_set.difference(dynamic_data_set)
return _data
cmd.extend('autoshow', autoshow) # pymol中自定义autoshow函数请使用cmd.autoshow()命令
#! 下面为具体使用查找共价键的代码
# def read_link_lines(item):
# ins1 = covalent.bypdb(pdbfilename = item,path='M:\program\\autotask\search_res1_pdb',pdbid = item[3:7])
# res = ins1.get_covalent()
# res['Pdb_id'] = [item[3:7] for _ in range(len(res))]
# return res
def return_mole(series):
# LINK 字段信息进行分析解读
residue = [i.upper() for i in 'Gly,Ala,Val,Leu,Ile,Pro,Phe,Tyr,Trp,Ser,Thr,Cys,Met,Asn,Gln,Asp,Glu,Lys,Arg,His'.split(',')]
if str(series[1]['Residue_name1']) in residue:
if str(series[1]['Residue_name2']) in residue:
# 除去两个氨基酸连接的情况
return pd.DataFrame()
else:
_data = series[1]['Residue_name2'],series[1]['Link_distance'],series[1]['Pdb_id'],series[1]['Chain_identifier2']
_df = pd.DataFrame(data={
'LigandName': pd.Series([_data[0]],dtype='string'),
'Link_distance': pd.Series([_data[1]],dtype='float32'),
'Pdb_id': pd.Series([_data[2]],dtype='string'),
'Chain_identifier': pd.Series([_data[3]],dtype='string')
})
return _df
elif str(series[1]['Residue_name2']) in residue:
if str(series[1]['Residue_name1']) in residue:
# 除去两个氨基酸连接的情况
return pd.DataFrame()
else:
_data = series[1]['Residue_name1'],series[1]['Link_distance'],series[1]['Pdb_id'],series[1]['Chain_identifier1']
_df = pd.DataFrame(data={
'LigandName': pd.Series([_data[0]],dtype='string'),
'Link_distance': pd.Series([_data[1]],dtype='float32'),
'Pdb_id': pd.Series([_data[2]],dtype='string'),
'Chain_identifier': pd.Series([_data[3]],dtype='string')
})
return _df
else:
logger.exception(
f'[MoleculeMatchError] Pdbid {series[1]["Pdb_id"]} occur a error\n'
f'出现了两个残基名称都不属于常见的20中\n{residue}\n'
f'source: {series[1]}'
)
return pd.DataFrame() # 两个小分子连接的情况,去除
if __name__ == '__main__':
...