#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:
#
# Author:      zinph
#
# Created:     01/11/2016
# Copyright:   (c) zinph 2016
# Licence:
#-------------------------------------------------------------------------------
import os
import re
import os.path
import datetime
from random import *
from itertools import *
from itertools import chain, combinations, product
from collections import Counter, defaultdict
from operator import itemgetter

# The chemistry toolkits are only needed by SIME.RS_check; the enumeration
# itself is pure string manipulation, so a missing toolkit must not prevent
# the module from being imported.
try:
    from molvs import standardize_smiles
except ImportError:
    standardize_smiles = None
try:
    from rdkit import Chem
    from rdkit.Chem import Descriptors
    from rdkit.Chem import AllChem
    from rdkit.Chem.Scaffolds import MurckoScaffold
except ImportError:
    Chem = Descriptors = AllChem = MurckoScaffold = None


class SIME:
    '''
    Enumerate compound SMILES strings from a core template.

    A core SMILES string marks structural-motif (SM) joints with "[*]" or
    numbered "[1*]", "[2*]", ... and sugar joints with "[*sugar*]".  Every
    joint is replaced by each candidate fragment and the resulting strings
    are appended to "<smile_file_name>_<n>.smiles" files.
    '''

    # Maximum number of compounds written to a single .smiles file.
    MAX_PER_FILE = 1000000

    def __init__(self, structural_motifs_file, sugars_file, max_repeat_motifs, minimal_sugars, library_size, enumerate_all_SMs, enumerate_all_sugars):
        '''
        Store the settings, write them to the "<stamp>_mcrl_info" file and
        load the sugar and structural-motif lists.

        structural_motifs_file / sugars_file: readable file objects, or None
            to read 'Data/selected_extenders.txt' / 'Data/sugars'.
        max_repeat_motifs: highest allowed count of the most common fragment
            in one compound.
        minimal_sugars: least number of sugar joints that must hold a sugar.
        library_size: number of compounds to generate.
        enumerate_all_SMs / enumerate_all_sugars: 'yes' (case-insensitive) to
            also generate the opposite stereocenter of each fragment.
        '''
        self.total_numcompounds = 0
        self.structural_motifs_file = structural_motifs_file
        self.sugars_file = sugars_file
        self.max_repeat_motifs = max_repeat_motifs
        self.minimal_sugars = minimal_sugars
        self.library_size = library_size
        self.enumerate_all_SMs = enumerate_all_SMs
        self.enumerate_all_sugars = enumerate_all_sugars
        # Rollover state for the .smiles output files (see make_compounds).
        self._file_counter = 1
        self._file_fill = 0

        # Same text as the former utcnow() call, without the deprecated API.
        date_stamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d-%H-%M-%S")
        self.smile_file_name = 'LIBRARIES/'+date_stamp+'_mcrl'
        # Opening the info file used to fail when LIBRARIES/ did not exist.
        os.makedirs(os.path.dirname(self.smile_file_name), exist_ok=True)
        self.info_manager = open(self.smile_file_name + '_info','a+')
        try:
            self.info_manager.write(f'Desired Library Size (numbers only) : {self.library_size}\n')
            self.info_manager.write(f'Maximum occurrence of the same structural motifs per scaffold (number only) : {self.max_repeat_motifs}\n')
            self.info_manager.write(f'Minimal number of sugars per scaffold (number only) :{self.minimal_sugars}\n')
            self.info_manager.write(f'Generate all possible stereocenters for extender structural motifs at joining carbons? {self.enumerate_all_SMs}\n')
            self.info_manager.write(f'Generate all possible stereocenters for sugars at joining carbons? {self.enumerate_all_sugars}\n')
            self.load_sugars()
            self.load_extenders()
        finally:
            # load_extenders() closes the file on success; closing again is a
            # no-op, and this also covers a failure while loading.
            self.info_manager.close()

    @staticmethod
    def _read_smiles_lines(source, default_path):
        '''
        Return the non-blank, stripped lines of `source` (a readable file
        object), or of the file at `default_path` when `source` is None.
        '''
        if source is None:
            with open(default_path, 'r') as handle:
                text = handle.read()
        else:
            text = source.read()
        return [line.strip() for line in text.splitlines() if line.strip()]

    def load_sugars(self):
        '''
        Fill self.sugars (and self.full_list) from the sugar file and record
        the sugars that were read in the info file.
        '''
        original_sugars = self._read_smiles_lines(self.sugars_file, 'Data/sugars')
        if self.enumerate_all_sugars.lower() == 'yes':
            candidates = chain.from_iterable(
                self.ENUMERATE_sugar_stereocenters(sugar) for sugar in original_sugars)
        else:
            candidates = original_sugars
        self.sugars = [r.replace('[*R*]','') for r in candidates]
        self.make_full_sugar_list() # make self.full_list by adding hydroxyl to self.sugars
        self.info_manager.write('\n\nSugars\n'+'\n'.join(original_sugars)+'\n')

    def load_extenders(self):
        '''
        Fill self.extenders from the structural-motif file, record the motifs
        that were read in the info file, and close the info file.
        '''
        original_extenders = self._read_smiles_lines(
            self.structural_motifs_file, 'Data/selected_extenders.txt')
        if self.enumerate_all_SMs.lower() == 'yes':
            candidates = self.enumerate_SM_stereocenters(original_extenders)
        else:
            candidates = original_extenders
        self.extenders = [r.replace('[*R*]','') for r in candidates]
        self.info_manager.write('\n\nStructural Motifs\n'+'\n'.join(original_extenders)+'\n')
        self.info_manager.close()

    def make_full_sugar_list(self):
        '''
        self.full_list contains all the sugars and hydroxyl groups.
        '''
        hydroxyl = ["[C@H](O)","[C@@H](O)"]
        self.full_list = self.sugars.copy()
        self.full_list += hydroxyl # contains all sugars and hydroxyl groups

    def ENUMERATE_sugar_stereocenters(self, smile):
        '''
        Take in a sugar string that starts with [*R*], and return a list of
        sugars with the two different stereocenters for the joining carbon.

        If the character right after the 5-character "[*R*]" prefix is a plain
        "C", both "[C@@H]" and "[C@H]" variants are returned (in that order).
        Otherwise the string itself is returned, followed by the variant with
        its first "@"/"@@" flipped when one occurs in the first 10 characters.
        '''
        joint = 5 # index of the joining atom, i.e. len('[*R*]')
        sugar_stereocenters = []
        # Slicing keeps short strings from raising and guarantees that the
        # atom at the joint is the one rewritten (str.replace would rewrite
        # the first "C" wherever it occurs).
        if smile[joint:joint+1] == "C":
            head, tail = smile[:joint], smile[joint+1:]
            sugar_stereocenters.append(head + "[C@@H]" + tail)
            sugar_stereocenters.append(head + "[C@H]" + tail)
        else:
            sugar_stereocenters.append(smile)
            prefix = smile[:10]
            if "@" in prefix:
                if "@@" in prefix: # for clockwise
                    flipped = smile.replace("@@", "@", 1)
                else:
                    flipped = smile.replace("@", "@@", 1)
                sugar_stereocenters.append(flipped)
        return sugar_stereocenters

    def remove_SM_digits(self, smile):
        '''
        Take a string, and locate places for replacement. They are indicated by [1*], [2*], etc....
        Return the string with all these joints replaced with [*]s.
        '''
        return re.sub(r'\[\d+\*\]', '[*]', smile)

    def generate_templates_withExtendersNSugars(self,smile):
        '''
        Generate all possible templates. Takes in a smile string (structural core).
        This function deals with both structural motifs and sugars.
        Then, insert all possible SMs and sugars at those joint positions and
        write the resulting compounds through self.make_compounds.
        '''
        # Split at the SM joints first; sugar joints are still embedded in
        # some of the resulting fragments.
        sm_split = self.string_splitter(self.remove_SM_digits(smile), '[*]')
        template = []
        # Every fragment (core piece, SM joint, sugar joint) stays a list and
        # keeps its position so that itertools.product can be applied later.
        for fragment in sm_split:
            if '[*sugar*]' in fragment[0]:
                template += self.string_splitter(fragment[0],'[*sugar*]')
            else:
                template.append(fragment)
        template = [x for x in template if x != ['']]
        SM_template = self.insert_SMs(template) # SM joints replaced by the list of extenders
        SGR_order = self.generate_dummy_sugar_templates(SM_template,minimal_sugars=self.minimal_sugars)
        # NOTE(review): arrangements overlap when there is more than one sugar
        # joint (a compound with sugars everywhere satisfies each of them), so
        # the same compound can be emitted more than once.
        for each in SGR_order:
            if self.total_numcompounds >= self.library_size:
                break # library already complete
            current_SYMBOLsugar_template = self.replace_SYMBOLsugars_with_dummies(each,SM_template)
            current_sugar_template = self.insert_sugars_to_dummies(current_SYMBOLsugar_template)
            self.make_compounds(current_sugar_template)

    def make_compounds(self, template):
        '''
        Write the compounds described by `template` (a list of lists of
        fragments) until self.library_size compounds exist in total.

        A combination is skipped when its most common fragment occurs more
        than self.max_repeat_motifs times.  Output rolls over to a new file
        every MAX_PER_FILE compounds; the position is remembered between
        calls so later templates continue in the current file.
        '''
        max_per_file = self.MAX_PER_FILE
        # getattr keeps instances created without __init__ usable.
        file_counter = getattr(self, '_file_counter', 1)
        room = max_per_file - getattr(self, '_file_fill', 0)
        pending = []
        for item in product(*template):
            if self.total_numcompounds >= self.library_size:
                break
            if not item:
                continue # an empty template describes no compound
            if self.max_occurrence(list(item))[1] > self.max_repeat_motifs:
                continue
            pending.append(''.join([str(r) for r in item]))
            self.total_numcompounds += 1
            if len(pending) >= room: # current file is full
                self.write_to_file(pending, file_counter)
                file_counter += 1
                room = max_per_file
                pending = []
        if pending:
            # Flush whatever is left, whether the product ran out or the
            # library limit was reached.
            self.write_to_file(pending, file_counter)
            room -= len(pending)
        self._file_counter = file_counter
        self._file_fill = max_per_file - room

    def write_to_file(self, compound_list, file_counter):
        '''
        Append the compounds, one per line, to
        "<self.smile_file_name>_<file_counter>.smiles".
        '''
        file_temp = self.smile_file_name + '_'+str(file_counter)+'.smiles'
        with open(file_temp,'a+') as file_handler:
            # Each line is newline-terminated so later appends start on a
            # fresh line.
            file_handler.writelines(compound + '\n' for compound in compound_list)

    def RS_check(self,smile,ringsize):
        '''
        Take in a smile string and the desired ring size. For many of the macrolides, it will be 14.
        If the atom with index 2 is in a ring of the desired size, return the same smile.
        If not (or if the smile cannot be parsed or has fewer than 3 atoms), return None.
        Raises ImportError when RDKit is not installed.
        '''
        if Chem is None:
            raise ImportError('RDKit is required for RS_check')
        m = Chem.MolFromSmiles(smile)
        if m is None or m.GetNumAtoms() < 3:
            return None
        if m.GetAtomWithIdx(2).IsInRingSize(ringsize):
            return smile
        return None

    def max_occurrence(self, template_list):
        '''
        Take in a list of fragments, and return the most common fragment along with the number of occurrences.
        Returns a tuple of the most common fragment in SMILE format, and its occurrences. E.g.('CCCC',5)
        On a tie the fragment seen first wins. Raises ValueError for an empty list.
        '''
        return max(Counter(template_list).items(), key=itemgetter(1))

    def enumerate_SM_stereocenters(self, ext):
        '''
        Return a flat list of structural motifs with both stereocenters.

        Every motif containing "@" is followed by the variant with its first
        "@@" turned into "@" (or, when it has no "@@", its first "@" turned
        into "@@").  Motifs without any stereocenter (like ketone) come last.
        Duplicates are removed and the input order is kept.
        '''
        with_stereo = [] # motifs that carry a stereocenter, each followed by its flipped form
        plain = [] # motifs without any specified stereocenter
        for motif in dict.fromkeys(ext): # order-preserving de-duplication
            if "@" not in motif:
                plain.append(motif)
                continue
            if "@@" in motif: # for clockwise
                flipped = motif.replace('@@','@',1)
            else:
                flipped = motif.replace('@','@@',1)
            with_stereo.extend((motif, flipped))
        # A flipped form may coincide with another input motif.
        return list(dict.fromkeys(with_stereo + plain))

    def replace_string_with_list(self,string, old, new):
        '''
        input ---> string = '1[*]234[*]5[*]6', old = '[*]', new = ['a','b','c']
        output ---> [['1'], ['a', 'b', 'c'], ['234'], ['a', 'b', 'c'], ['5'], ['a', 'b', 'c'], ['6']]
        Takes in a string, split them at "old" positions at which "new" list is added.
        Returns a nested list of all fragments as shown in output.
        The same "new" list object is placed at every position.
        '''
        frag = []
        for position, piece in enumerate(string.split(old)):
            if position:
                frag.append(new)
            frag.append([piece])
        return frag

    def string_splitter(self, string, split_character):
        '''
        Very similar to self.replace_string_with_list.
        input ---> string = '1[*]234[*]5[*]6', split_character = '[*]'
        output ---> [['1'], ['[*]'], ['234'], ['[*]'], ['5'], ['[*]'], ['6']]
        Takes in a string and split it at the positions of split character.
        Returns a list of all the split fragments, including the split characters.
        Each individual item in the list is a one-element list holding a string.
        '''
        frag = []
        for position, piece in enumerate(string.split(split_character)):
            if position:
                frag.append([split_character])
            frag.append([piece])
        return frag

    def insert_SMs(self, template):
        '''
        Takes in a template built with string_splitter; it would look like --->
        template =[['1'], ['[*]'], ['2'],['[*sugar*]'],['3'], ['[*]'], ['4']]
        Replace every ['[*]'] with self.extenders, in place, and return the template.
        '''
        for index, value in enumerate(template):
            if value == ['[*]']:
                template[index] = self.extenders
        return template

    def generate_dummy_sugar_templates(self, template, minimal_sugars=1):
        '''
        minimal_sugars = the least number of sugars the users want in each macrolide.
        Default is one, i.e. there will be at least one sugar in each macrolide.
        Return a list of tuples, one per arrangement, holding the dummies 'SUGARS'
        (intended for only sugars) and 'FULL_LIST' (intended for sugars + hydroxy),
        one dummy per ['[*sugar*]'] entry of the template.
        minimal_sugars is clamped to the number of sugar positions.
        '''
        num_sugars = template.count(['[*sugar*]'])
        required = max(0, min(minimal_sugars, num_sugars))
        sugar_lists_in_order = []
        # Choosing which positions must hold a sugar gives every distinct
        # arrangement once, in the same order as de-duplicated permutations.
        for sugar_positions in combinations(range(num_sugars), required):
            forced = set(sugar_positions)
            sugar_lists_in_order.append(tuple(
                'SUGARS' if position in forced else 'FULL_LIST'
                for position in range(num_sugars)))
        return sugar_lists_in_order

    def replace_SYMBOLsugars_with_dummies(self, sugar_dummy_order,template_with_sugarinlist):
        '''
        each sugar_dummy_order looks like ('SUGARS', 'FULL_LIST', 'FULL_LIST', 'FULL_LIST')
        template_with_sugarinlist looks like = [['1'], [ext1,ext2,...], ['2'],['[*sugar*]'],['3'], [ext1,ext2,...], ['[*sugar*]'],['4']]
        Return a shallow copy of the template in which the k-th ['[*sugar*]'] is
        replaced by sugar_dummy_order[k]. The original template is not modified.
        '''
        filled = list(template_with_sugarinlist)
        next_dummy = 0
        for index, value in enumerate(template_with_sugarinlist):
            if value == ['[*sugar*]']:
                filled[index] = sugar_dummy_order[next_dummy] # match the dummies at the correct sugar positions
                next_dummy += 1
        return filled

    def insert_sugars_to_dummies(self, template):
        '''
        Takes in a template produced by replace_SYMBOLsugars_with_dummies; it would look like --->
        template =[['1'], [ext1,ext2,...], ['2'], 'SUGARS', ['3'], 'FULL_LIST']
        Replace 'SUGARS' with self.sugars and 'FULL_LIST' with self.full_list,
        in place, and return the template.
        '''
        for index, value in enumerate(template):
            if value == 'SUGARS':
                template[index] = self.sugars
            elif value == 'FULL_LIST':
                template[index] = self.full_list
        return template