386 lines
19 KiB
Python
Executable File
386 lines
19 KiB
Python
Executable File
#-------------------------------------------------------------------------------
|
|
# Name: module1
|
|
# Purpose:
|
|
#
|
|
# Author: zinph
|
|
#
|
|
# Created: 01/11/2016
|
|
# Copyright: (c) zinph 2016
|
|
# Licence: <your licence>
|
|
#-------------------------------------------------------------------------------
|
|
|
|
import os
|
|
import re
|
|
import os.path
|
|
import datetime
|
|
from molvs import standardize_smiles
|
|
from random import *
|
|
from rdkit import Chem
|
|
from rdkit.Chem import Descriptors
|
|
from rdkit.Chem import AllChem
|
|
from itertools import *
|
|
from collections import Counter, defaultdict
|
|
from rdkit.Chem.Scaffolds import MurckoScaffold
|
|
from operator import itemgetter
|
|
|
|
class SIME:
|
|
|
|
def __init__(self, structural_motifs_file, sugars_file, max_repeat_motifs, minimal_sugars, library_size, enumerate_all_SMs, enumerate_all_sugars):
    '''
    Store the enumeration settings, start the info log of this run and load the building blocks.

    structural_motifs_file, sugars_file : objects whose read() returns text with one
        entry per line, or None to let load_extenders() / load_sugars() read their
        default files under 'Data/'.
    max_repeat_motifs : highest number of times one fragment may occur in a compound
        (checked in make_compounds()).
    minimal_sugars : least number of sugar positions that must hold a sugar.
    library_size : number of compounds after which enumeration stops.
    enumerate_all_SMs, enumerate_all_sugars : strings; 'yes' (any letter case) switches
        on stereocenter enumeration for structural motifs / sugars.

    Side effects: creates 'LIBRARIES/' when missing and opens
    'LIBRARIES/<UTC timestamp>_mcrl_info' in append mode as self.info_manager.
    load_extenders() closes that file once loading has finished.
    '''
    self.total_numcompounds = 0
    self.structural_motifs_file = structural_motifs_file
    self.sugars_file = sugars_file
    self.max_repeat_motifs = max_repeat_motifs
    self.minimal_sugars = minimal_sugars
    self.library_size = library_size
    self.enumerate_all_SMs = enumerate_all_SMs
    self.enumerate_all_sugars = enumerate_all_sugars

    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the formatted stamp is the same.
    date_stamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d-%H-%M-%S")
    libraries_dir = 'LIBRARIES/'
    self.smile_file_name = libraries_dir + date_stamp + '_mcrl'
    # exist_ok avoids the check-then-create race of os.path.exists() + makedirs().
    os.makedirs(libraries_dir, exist_ok=True)

    self.info_manager = open(self.smile_file_name + '_info', 'a+')
    try:
        self.info_manager.write(f'Desired Library Size (numbers only) : {self.library_size}\n')
        self.info_manager.write(f'Maximum occurrence of the same structural motifs per scaffold (number only) : {self.max_repeat_motifs}\n')
        self.info_manager.write(f'Minimal number of sugars per scaffold (number only) :{self.minimal_sugars}\n')
        self.info_manager.write(f'Generate all possible stereocenters for extender structural motifs at joining carbons? {self.enumerate_all_SMs}\n')
        self.info_manager.write(f'Generate all possible stereocenters for sugars at joining carbons? {self.enumerate_all_sugars}\n')

        self.load_sugars()
        self.load_extenders()
    except BaseException:
        # Do not leak the log handle when writing or loading fails;
        # close() is harmless if the file was already closed.
        self.info_manager.close()
        raise
|
|
|
|
# def create_directory(self):
|
|
# old_directory = os.getcwd()
|
|
# newfolder = input('Peferred Directory Name for Output Files: ')
|
|
# new_directory = os.path.join(old_directory, newfolder)
|
|
# while os.path.exists(new_directory):
|
|
# print('This folder exists or input is invalid. Try again.')
|
|
# newfolder = input('Peferred Directory Name for Output Files: ')
|
|
# new_directory = os.path.join(old_directory,newfolder)
|
|
# os.mkdir(new_directory)
|
|
# return new_directory
|
|
|
|
def load_sugars(self):
    '''
    Read the sugar building blocks and fill self.sugars and self.full_list.

    Source is self.sugars_file.read() when a file object was supplied, otherwise the
    file 'Data/sugars'. Blank lines are skipped and surrounding whitespace is removed,
    so an empty line can neither crash the stereocenter enumeration nor become an
    empty "sugar". When self.enumerate_all_sugars is 'yes' (any letter case) every
    entry is expanded with ENUMERATE_sugar_stereocenters(). The '[*R*]' markers are
    removed from the stored strings. The entries that were read are appended to the
    info log (self.info_manager).
    '''
    if self.sugars_file is None:
        with open('Data/sugars', 'r') as f:
            raw_lines = f.read().splitlines()
    else:
        raw_lines = self.sugars_file.read().splitlines()
    original_sugars = [line.strip() for line in raw_lines if line.strip()]

    if self.enumerate_all_sugars.lower() == 'yes':
        candidates = chain.from_iterable(
            self.ENUMERATE_sugar_stereocenters(sugar) for sugar in original_sugars
        )
    else:
        candidates = original_sugars
    self.sugars = [r.replace('[*R*]', '') for r in candidates]

    self.make_full_sugar_list()  # builds self.full_list from self.sugars
    self.info_manager.write('\n\nSugars\n'+'\n'.join(original_sugars)+'\n')
|
|
|
|
def load_extenders(self):
    '''
    Read the structural motifs (extenders), fill self.extenders and finish the info log.

    Source is self.structural_motifs_file.read() when a file object was supplied,
    otherwise the file 'Data/selected_extenders.txt'. Blank lines are skipped and
    surrounding whitespace is removed. When self.enumerate_all_SMs is 'yes' (any
    letter case) the list is expanded with enumerate_SM_stereocenters(). The '[*R*]'
    markers are removed from the stored strings.

    The entries that were read are appended to self.info_manager, which is then
    closed - also when loading fails, so the handle is never leaked.
    '''
    try:
        if self.structural_motifs_file is None:
            with open('Data/selected_extenders.txt', 'r') as f:
                raw_lines = f.read().splitlines()
        else:
            raw_lines = self.structural_motifs_file.read().splitlines()
        original_extenders = [line.strip() for line in raw_lines if line.strip()]

        if self.enumerate_all_SMs.lower() == 'yes':
            candidates = self.enumerate_SM_stereocenters(original_extenders)
        else:
            candidates = original_extenders
        self.extenders = [r.replace('[*R*]', '') for r in candidates]

        self.info_manager.write('\n\nStructural Motifs\n'+'\n'.join(original_extenders)+'\n')
    finally:
        self.info_manager.close()
|
|
|
|
|
|
def make_full_sugar_list(self):
    '''
    Build self.full_list: every entry of self.sugars followed by the two
    hydroxyl fragments "[C@H](O)" and "[C@@H](O)". self.sugars itself is left untouched.
    '''
    self.full_list = [*self.sugars, "[C@H](O)", "[C@@H](O)"]
|
|
|
|
|
|
def ENUMERATE_sugar_stereocenters(self, smile):
    '''
    Return the stereo variants of a sugar at its joining carbon.

    smile is expected to start with the 5-character marker '[*R*]', so the joining
    atom starts at index 5.

    * Joining atom written as a plain 'C' (stereo undefined): returns two strings,
      with that 'C' replaced by '[C@@H]' and by '[C@H]'.
    * Otherwise the input is returned first; when a chirality mark ('@' or '@@')
      starts within the first 10 characters, a second string with that first mark
      flipped is appended.

    Strings too short to have an index 5 are returned unchanged in a one-element list.
    '''
    # '==' instead of the original identity test against a literal, and a length
    # guard so short strings do not raise IndexError.
    if len(smile) > 5 and smile[5] == "C":
        head, tail = smile[:5], smile[6:]
        # Slice at index 5 explicitly rather than replacing "the first C".
        return [head + "[C@@H]" + tail, head + "[C@H]" + tail]

    sugar_stereocenters = [smile]
    marker = smile.find("@", 0, 10)
    if marker != -1:
        # Look at the full mark even when '@@' straddles the 10-character window,
        # so '@@' is never turned into '@@@'.
        if smile.startswith("@@", marker):  # clockwise -> anticlockwise
            flipped = smile[:marker] + "@" + smile[marker + 2:]
        else:  # anticlockwise -> clockwise
            flipped = smile[:marker] + "@@" + smile[marker + 1:]
        sugar_stereocenters.append(flipped)
    return sugar_stereocenters
|
|
|
|
|
|
def remove_SM_digits(self, smile):
    '''
    Turn every numbered joint marker ([1*], [2*], [12*], ...) in smile into the
    anonymous marker [*] and return the resulting string.
    Digits that are not part of such a marker (e.g. ring-closure numbers) stay as they are.
    '''
    return re.sub(r'\[\d+\*\]', '[*]', smile)
|
|
|
|
# def generate_templates_withextenders(self, smile):
|
|
# '''
|
|
# Generate all possible templates. Takes in a smile string (structural core).
|
|
# This function only deals with extenders or structural motifs.
|
|
# Then, insert all possible extenders at those joint positions.
|
|
# '''
|
|
# smile_with_stars = [[r] for r in self.remove_SM_digits(smile).split('[*]')] # take a string with [*]s and split into different fragments, convert each fragment into a list
|
|
# counter = 1
|
|
# # smile_with_stars = [ fragment1, fragment2, fragment3 ,...] all are split at joint positions
|
|
# # insert self.extenders in between all fragments (except for the first and last blocks).
|
|
# # so it will be something like [fragment1, [self.extenders], fragment2, [self.extenders], fragment3, [self.extenders] ,...]
|
|
# for i in range(len(smile_with_stars)-1):
|
|
# shuffle(self.extenders)
|
|
# smile_with_stars.insert(counter,self.extenders)
|
|
# counter+=2
|
|
#
|
|
# template = [x for x in smile_with_stars if x != ['']]
|
|
# self.make_compounds(template)
|
|
# return template
|
|
|
|
def generate_templates_withExtendersNSugars(self,smile):
    '''
    Enumerate compounds for one structural core (smile) that carries both
    structural-motif joints ([1*], [2*], ...) and sugar joints ([*sugar*]).

    The core is cut at every joint into an ordered list of one-element lists
    (fixed fragments and joint markers). Motif joints are replaced by the extender
    list, then for every arrangement returned by generate_dummy_sugar_templates()
    the sugar joints are replaced by the matching sugar lists and the finished
    template is handed to make_compounds(). Nothing is returned.
    '''
    sugar_tag = '[*sugar*]'
    # First cut at the motif joints; sugar joints are still embedded in the pieces.
    core_pieces = self.string_splitter(self.remove_SM_digits(smile), '[*]')

    template = []
    for piece in core_pieces:
        text = piece[0]  # every piece is a one-element list
        if sugar_tag in text:
            # Second cut, at the sugar joints inside this piece.
            template.extend(self.string_splitter(text, sugar_tag))
        else:
            template.append(piece)

    # Cutting next to a joint leaves empty fragments behind; drop them.
    template = [fragment for fragment in template if fragment != ['']]
    SM_template = self.insert_SMs(template)

    arrangements = self.generate_dummy_sugar_templates(SM_template, minimal_sugars=self.minimal_sugars)
    for sugar_order in arrangements:
        with_dummies = self.replace_SYMBOLsugars_with_dummies(sugar_order, SM_template)
        self.make_compounds(self.insert_sugars_to_dummies(with_dummies))
|
|
|
|
def make_compounds(self, template):
    '''
    Enumerate the compounds described by template and write them to file.

    template is a list of lists of fragments; every combination (Cartesian product)
    that takes one fragment per position is joined into one string. A combination is
    kept only when its most frequent fragment occurs at most self.max_repeat_motifs
    times. Enumeration stops as soon as self.total_numcompounds, which is shared
    across calls, reaches self.library_size.

    Kept compounds are flushed through write_to_file() in batches of at most
    1,000,000, and whatever is left when the loop ends is flushed as well, so no
    compound is lost when the combinations run out before the library is full.
    The file counter starts again at 1 on every call.
    '''
    max_per_file = 1000000
    file_counter = 1
    written = []
    for item in product(*template):  # Cartesian product over all template positions
        if self.total_numcompounds >= self.library_size:
            break  # library is full
        # Enforce the user-defined limit on repeats of the same fragment.
        if self.max_occurrence(list(item))[1] > self.max_repeat_motifs:
            continue
        written.append(''.join(str(r) for r in item))  # join the fragments into one string
        self.total_numcompounds += 1
        if len(written) >= max_per_file:
            self.write_to_file(written, file_counter)
            file_counter += 1
            written = []
    if written:
        self.write_to_file(written, file_counter)

def write_to_file(self, compound_list, file_counter):
    '''
    Append compound_list, one compound per line, to
    '<self.smile_file_name>_<file_counter>.smiles'.

    Every batch ends with a newline so that a later append to the same file starts
    on a fresh line. An empty batch writes nothing.
    '''
    if not compound_list:
        return
    file_temp = self.smile_file_name + '_'+str(file_counter)+'.smiles'
    with open(file_temp, 'a+') as file_handler:
        file_handler.write('\n'.join(compound_list) + '\n')
|
|
|
|
# def make_compounds(self,template):
|
|
# written = []
|
|
# file_counter = 1
|
|
# file_temp = self.smile_file_name + '_'+str(file_counter)+'.smiles' # attempts to split files because they get too large. Name of the first file will be "file_" + this variable
|
|
# file_handler = open(file_temp,'a+')
|
|
# for item in product(*template):
|
|
# if self.max_occurrence(list(item))[1] <= self.max_repeat_motifs: # If the count of most common SM is less than or equal to the number set up by the user
|
|
# if self.total_numcompounds <= self.library_size:
|
|
# if len(written) < 1000000:
|
|
# temp = ''.join([str(r) for r in item])
|
|
# # m = Chem.MolFromSmiles(temp)
|
|
# self.total_numcompounds += 1
|
|
# written.append(temp)
|
|
# else:
|
|
# file_handler.write('\n'.join(written))
|
|
# file_handler.close()
|
|
# file_counter +=1
|
|
# file_temp = self.smile_file_name + '_'+str(file_counter)+'.smiles'
|
|
# file_handler = open(file_temp,'a+')
|
|
# written = []
|
|
# else:
|
|
# file_handler.write('\n'.join(written)) # write smiles in written list
|
|
# break
|
|
# file_handler.close()
|
|
|
|
|
|
def RS_check(self,smile,ringsize):
    '''
    Return smile when the atom with index 2 of the parsed molecule lies in a ring of
    size ringsize (14 for many macrolides); return None otherwise.

    None is also returned when RDKit cannot parse smile or when the molecule has
    fewer than three atoms, instead of failing on the missing molecule / atom.
    NOTE(review): the check looks only at atom index 2 - presumably that atom is
    always part of the macrolactone ring in the cores used here; confirm.
    '''
    m = Chem.MolFromSmiles(smile)
    # MolFromSmiles reports a parse failure by returning None.
    if m is None or m.GetNumAtoms() < 3:
        return None
    if m.GetAtomWithIdx(2).IsInRingSize(ringsize):
        return smile
    return None
|
|
|
|
def max_occurrence(self, template_list):
    '''
    Count the fragments in template_list and return the most frequent one as a
    (fragment, occurrences) tuple, e.g. ('CCCC', 5).
    On a tie the fragment that appears first in template_list wins.
    Raises ValueError for an empty list.
    '''
    tally = Counter(template_list)
    return max(tally.items(), key=lambda pair: pair[1])
|
|
|
|
|
|
def enumerate_SM_stereocenters(self, ext):
    '''
    Expand a list of structural motifs (SMs) with their opposite stereo form.

    For every distinct SM that contains a chirality mark, the SM and a copy with its
    first '@@' turned into '@' (or, when it has no '@@', its first '@' turned into
    '@@') are emitted. SMs without any mark, such as a ketone, are emitted unchanged.

    Returns one flat list: all stereo SMs with their partners first, then the plain
    SMs, each in order of first appearance in ext. Every string occurs only once,
    also when ext already contains both stereo forms, and the order is the same on
    every run.
    '''
    stereo = []  # SMs with a chirality mark, each followed by its flipped partner
    plain = []   # SMs without any specified stereocenter
    # dict.fromkeys removes duplicates but, unlike set(), keeps the input order.
    for motif in dict.fromkeys(ext):
        if "@" not in motif:
            plain.append(motif)
            continue
        if "@@" in motif:  # clockwise -> anticlockwise
            partner = motif.replace('@@', '@', 1)
        else:              # anticlockwise -> clockwise
            partner = motif.replace('@', '@@', 1)
        stereo.extend((motif, partner))
    # A partner may itself be part of the input; drop such repeats.
    return list(dict.fromkeys(stereo + plain))
|
|
|
|
def replace_string_with_list(self,string, old, new):
    '''
    Split string at every occurrence of old and put the list new at each split point.

    input  ---> string = '1[*]234[*]5[*]6', old = '[*]', new = ['a','b','c']
    output ---> [['1'], ['a', 'b', 'c'], ['234'], ['a', 'b', 'c'], ['5'], ['a', 'b', 'c'], ['6']]

    Fixed fragments are wrapped in one-element lists; the very same new object
    (not a copy) is placed at every split point.
    '''
    first, *remaining = string.split(old)
    frag = [[first]]
    for piece in remaining:
        frag.append(new)
        frag.append([piece])
    return frag
|
|
|
|
|
|
def string_splitter(self, string, split_character):
    '''
    Split string at every occurrence of split_character and keep the separators.

    input  ---> string = '1[*]234[*]5[*]6', split_character = '[*]'
    output ---> [['1'], ['[*]'], ['234'], ['[*]'], ['5'], ['[*]'], ['6']]

    Returns a list in which every fragment and every separator is wrapped in its own
    one-element list (a nested list, like self.replace_string_with_list returns).
    Empty fragments, e.g. when the string starts with the separator, are kept as [''].
    '''
    frag = []
    for position, piece in enumerate(string.split(split_character)):
        if position:
            # a separator sits between every two neighbouring pieces
            frag.append([split_character])
        frag.append([piece])
    return frag
|
|
|
|
def insert_SMs(self, template):
    '''
    Put the extender list at every structural-motif joint of a split template.

    template looks like
    [['1'], ['[*]'], ['2'], ['[*sugar*]'], ['3'], ['[*]'], ['4']]
    Every entry equal to ['[*]'] is replaced by self.extenders (the same list object
    each time); sugar joints and fixed fragments are left alone.
    The template is changed in place and also returned.
    '''
    for position, fragment in enumerate(template):
        if fragment == ['[*]']:
            template[position] = self.extenders
    return template
|
|
|
|
def generate_dummy_sugar_templates(self, template, minimal_sugars=1):
    '''
    List every way to distribute the placeholders 'SUGARS' and 'FULL_LIST' over the
    sugar joints of template.

    template       : split template; sugar joints are the entries equal to ['[*sugar*]'].
    minimal_sugars : number of joints that must carry a real sugar (default 1).
                     Negative values are treated as 0.

    Exactly minimal_sugars positions get 'SUGARS' (sugars only), the others get
    'FULL_LIST' (sugars + hydroxyl). Returns a list of tuples, one per distinct
    arrangement, ordered with 'SUGARS' in the earliest positions first, e.g. for
    three joints and minimal_sugars=1:
    [('SUGARS','FULL_LIST','FULL_LIST'), ('FULL_LIST','SUGARS','FULL_LIST'), ('FULL_LIST','FULL_LIST','SUGARS')]

    When minimal_sugars exceeds the number of joints, a single all-'SUGARS' tuple of
    length minimal_sugars is returned.
    '''
    num_sugars = template.count(['[*sugar*]'])
    forced = max(minimal_sugars, 0)
    slots = max(num_sugars, forced)
    sugar_lists_in_order = []
    # Choosing the 'SUGARS' positions directly yields each distinct arrangement once,
    # instead of walking all slots! permutations and discarding the repeats.
    for sugar_positions in combinations(range(slots), forced):
        chosen = set(sugar_positions)
        sugar_lists_in_order.append(
            tuple('SUGARS' if position in chosen else 'FULL_LIST' for position in range(slots))
        )
    return sugar_lists_in_order
|
|
|
|
|
|
def replace_SYMBOLsugars_with_dummies(self, sugar_dummy_order,template_with_sugarinlist):
    '''
    Return a copy of the template in which the k-th sugar joint holds the k-th placeholder.

    sugar_dummy_order looks like ('SUGARS', 'FULL_LIST', 'FULL_LIST', 'FULL_LIST')
    template_with_sugarinlist looks like
    [['1'], [ext1,ext2,...], ['2'], ['[*sugar*]'], ['3'], [ext1,ext2,...], ['[*sugar*]'], ['4']]

    Only entries equal to ['[*sugar*]'] are replaced. The input template is not
    modified (the copy is shallow: the other entries are shared with it).
    '''
    filled = template_with_sugarinlist.copy()
    next_dummy = 0  # index of the next unused placeholder in sugar_dummy_order
    for position, fragment in enumerate(template_with_sugarinlist):
        if fragment == ['[*sugar*]']:
            filled[position] = sugar_dummy_order[next_dummy]
            next_dummy += 1
    return filled
|
|
|
|
def insert_sugars_to_dummies(self, template):
    '''
    Swap the sugar placeholders of a template for the actual fragment lists.

    template looks like
    [['1'], [ext1,ext2,...], 'SUGARS', ['3'], 'FULL_LIST', ['4']]
    Every 'SUGARS' entry becomes self.sugars and every 'FULL_LIST' entry becomes
    self.full_list (the same list objects, not copies).
    The template is changed in place and also returned.
    '''
    for position, entry in enumerate(template):
        if entry == 'SUGARS':
            template[position] = self.sugars
        elif entry == 'FULL_LIST':
            template[position] = self.full_list
    return template
|