SIME/SIME.py

#-------------------------------------------------------------------------------
# Name:        module1
# Purpose:
#
# Author:      zinph
#
# Created:     01/11/2016
# Copyright:   (c) zinph 2016
# Licence:     <your licence>
#-------------------------------------------------------------------------------

import os
import re
import os.path
import datetime
from molvs import standardize_smiles
from random import *
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from itertools import *
from collections import Counter, defaultdict
from rdkit.Chem.Scaffolds import MurckoScaffold
from operator import itemgetter

class SIME:

    def __init__(self, structural_motifs_file, sugars_file, max_repeat_motifs, minimal_sugars, library_size, enumerate_all_SMs, enumerate_all_sugars):

##        self.info = {}  # to record number of compounds for each length
        self.total_numcompounds      = 0
        self.structural_motifs_file  = structural_motifs_file
        self.sugars_file             = sugars_file
        self.max_repeat_motifs       = max_repeat_motifs
        self.minimal_sugars          = minimal_sugars
        self.library_size            = library_size
        self.enumerate_all_SMs       = enumerate_all_SMs
        self.enumerate_all_sugars    = enumerate_all_sugars
        date_stamp                   = datetime.datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
        self.smile_file_name         = 'LIBRARIES/'+date_stamp+'_mcrl'
        libraries_dir = 'LIBRARIES/'
        if not os.path.exists(libraries_dir):
            os.makedirs(libraries_dir)
        self.info_manager = open(self.smile_file_name + '_info','a+')
        self.info_manager.write(f'Desired Library Size (numbers only) : {self.library_size}\n')
        self.info_manager.write(f'Maximum occurrence of the same structural motifs per scaffold (number only) : {self.max_repeat_motifs}\n')
        self.info_manager.write(f'Minimal number of sugars per scaffold (number only) :{self.minimal_sugars}\n')
        self.info_manager.write(f'Generate all possible stereocenters for extender structural motifs at joining carbons? {self.enumerate_all_SMs}\n')
        self.info_manager.write(f'Generate all possible stereocenters for sugars at joining carbons? {self.enumerate_all_sugars}\n')

        self.load_sugars()
        self.load_extenders()

    # def create_directory(self):
    #     old_directory = os.getcwd()
    #     newfolder = input('Peferred Directory Name for Output Files: ')
    #     new_directory = os.path.join(old_directory, newfolder)
    #     while os.path.exists(new_directory):
    #         print('This folder exists or input is invalid. Try again.')
    #         newfolder = input('Peferred Directory Name for Output Files: ')
    #         new_directory = os.path.join(old_directory,newfolder)
    #     os.mkdir(new_directory)
    #     return new_directory

    def load_sugars(self):
        if self.sugars_file == None:
            with open('Data/sugars', 'r') as f:
                original_sugars = f.read().splitlines()
        else:
            original_sugars = self.sugars_file.read().splitlines()
        if self.enumerate_all_sugars.lower() == 'yes':
            sugars = []
            for i in original_sugars:
                sugars.append(self.ENUMERATE_sugar_stereocenters(i))
            self.sugars = [r.replace('[*R*]','') for r in list(chain(*sugars))]
        else:
            self.sugars = [r.replace('[*R*]','') for r in original_sugars]
        self.make_full_sugar_list() # make self.full_list by adding hydroxyl to self.sugars
        self.info_manager.write('\n\nSugars\n'+'\n'.join(original_sugars)+'\n')

    def load_extenders(self):
        if self.structural_motifs_file == None:
            with open('Data/selected_extenders.txt','r') as f:
                original_extenders = f.read().splitlines()
        else:
            original_extenders = self.structural_motifs_file.read().splitlines()
        if self.enumerate_all_SMs.lower() == 'yes':
            self.extenders = [r.replace('[*R*]','') for r in self.enumerate_SM_stereocenters(original_extenders)]
        else:
            self.extenders = [r.replace('[*R*]','') for r in original_extenders]
        self.info_manager.write('\n\nStructural Motifs\n'+'\n'.join(original_extenders)+'\n')
        self.info_manager.close()


    def make_full_sugar_list(self):
        '''
        Build self.full_list: a new list holding every entry of self.sugars followed by
        the two hydroxyl fragments. self.sugars itself is left untouched.
        '''
        self.full_list = [*self.sugars, "[C@H](O)", "[C@@H](O)"]


    def ENUMERATE_sugar_stereocenters(self, smile):
        '''
        Take a sugar string that starts with [*R*] and return a list with the stereo variants
        of its joining carbon.

        - If the character right after the leading 5-character marker (index 5) is a plain "C",
          the stereocenter is undefined: return the "[C@@H]" and the "[C@H]" variant, in that order.
        - Otherwise return the string itself and, when a chirality mark starts within the first
          10 characters, a second variant with that first mark flipped ("@" <-> "@@").
        - Strings without any such mark (including strings shorter than 6 characters) are
          returned unchanged as a single-item list.
        '''
        # Compare by value on a slice: "is" tests identity, and slicing tolerates short strings.
        if smile[5:6] == "C":
            head, tail = smile[:5], smile[6:]
            # Replace exactly the joining carbon at index 5, wherever else a "C" may occur.
            return [head + "[C@@H]" + tail, head + "[C@H]" + tail]

        sugar_stereocenters = [smile]
        mark = smile.find("@", 0, 10)
        if mark != -1:
            # Look at the full mark even when "@@" straddles the 10-character window,
            # so that "@@" is never turned into "@@@".
            if smile[mark:mark + 2] == "@@":   # for clockwise
                template = smile[:mark] + "@" + smile[mark + 2:]
            else:
                template = smile[:mark] + "@@" + smile[mark + 1:]
            sugar_stereocenters.append(template)
        return sugar_stereocenters


    def remove_SM_digits(self, smile):
        '''
        Normalise the numbered joint markers of a core string.
        Every marker of the form [1*], [2*], [12*], ... is rewritten to the anonymous
        joint [*]; all other text (ring-closure digits included) is returned untouched.
        '''
        # A numbered joint is an opening bracket, one or more digits, a star and a closing bracket.
        return re.sub(r'\[\d+\*\]', '[*]', smile)

    # def generate_templates_withextenders(self, smile):
    #     '''
    #     Generate all possible templates. Takes in a smile string (structural core).
    #     This function only deals with extenders or structural motifs.
    #     Then, insert all possible extenders at those joint positions.
    #     '''
    #     smile_with_stars = [[r] for r in self.remove_SM_digits(smile).split('[*]')]  # take a string with [*]s and split into different fragments, convert each fragment into a list
    #     counter = 1
    #     # smile_with_stars = [ fragment1, fragment2, fragment3 ,...] all are split at joint positions
    #     # insert self.extenders in between all fragments (except for the first and last blocks).
    #     # so it will be something like [fragment1, [self.extenders], fragment2, [self.extenders], fragment3, [self.extenders] ,...]
    #     for i in range(len(smile_with_stars)-1):
    #         shuffle(self.extenders)
    #         smile_with_stars.insert(counter,self.extenders)
    #         counter+=2
    #
    #     template = [x for x in smile_with_stars if x != ['']]
    #     self.make_compounds(template)
    #     return template

    def generate_templates_withExtendersNSugars(self,smile):
        '''
        Expand one structural core into compounds.

        The core is cut at its structural-motif joints ([1*], [2*], ... normalised to [*]) and at
        its '[*sugar*]' joints. Every piece is kept as a list so that the pieces can be combined
        with a cartesian product later on, and the pieces stay in their original order.
        Motif joints receive self.extenders; sugar joints receive, for every arrangement produced
        by generate_dummy_sugar_templates, either self.sugars or self.full_list.
        Each finished template is handed to self.make_compounds. Nothing is returned.
        '''
        # First cut at the motif joints; sugar joints are still embedded in some of the pieces.
        motif_split = self.string_splitter(self.remove_SM_digits(smile), '[*]')

        template = []
        for piece in motif_split:
            text = piece[0]   # every piece is a one-item list
            if '[*sugar*]' in text:
                # cut this piece once more, now at its sugar joints
                template.extend(self.string_splitter(text, '[*sugar*]'))
            else:
                template.append(piece)

        # Empty pieces appear when a joint sits at an end of the string or next to another joint.
        template = [piece for piece in template if piece != ['']]

        SM_template = self.insert_SMs(template)  # motif joints now hold the list of extenders
        # One arrangement per way of forcing at least self.minimal_sugars sugars onto the sugar joints.
        arrangements = self.generate_dummy_sugar_templates(SM_template,minimal_sugars=self.minimal_sugars)

        for arrangement in arrangements:
            with_dummies = self.replace_SYMBOLsugars_with_dummies(arrangement,SM_template)
            with_sugars = self.insert_sugars_to_dummies(with_dummies)
            self.make_compounds(with_sugars)

    def make_compounds(self, template):
        max_per_file = 1000000
        file_counter = 1
        written      = []
        for item in product(*template): # 使用笛卡尔积枚举所有模板的组合
            if self.max_occurrence(list(item))[1] <= self.max_repeat_motifs:# 确保用户定义的重复约束 # If the count of most common SM is less than or equal to the number set up by the user
                if self.library_size <= max_per_file:
                    if self.total_numcompounds <= self.library_size:
                        temp = ''.join([str(r) for r in item]) # 将片段组合成字符串
                        self.total_numcompounds += 1
                        written.append(temp)
                    else:
                        self.write_to_file(written, file_counter)
                        break
                elif self.library_size > max_per_file:
                    if self.total_numcompounds <= self.library_size:
                        if len(written) <= max_per_file:
                            temp = ''.join([str(r) for r in item])
                            self.total_numcompounds += 1
                            written.append(temp)
                        else:
                            self.write_to_file(written, file_counter)
                            file_counter +=1
                            written = []
                    else:
                        break

    def write_to_file(self, compound_list, file_counter):
        '''
        When total compound is 1000000 or library size, this function will be called
        to write compounds to file.
        '''
        file_temp = self.smile_file_name + '_'+str(file_counter)+'.smiles' # attempts to split files because they get too large. Name of the first file will be "file_" + this variable
        with open(file_temp,'a+') as file_handler:
            file_handler.write('\n'.join(compound_list))

    # def make_compounds(self,template):
    #     written = []
    #     file_counter = 1
    #     file_temp = self.smile_file_name + '_'+str(file_counter)+'.smiles' # attempts to split files because they get too large. Name of the first file will be "file_" + this variable
    #     file_handler = open(file_temp,'a+')
    #     for item in product(*template):
    #         if self.max_occurrence(list(item))[1] <= self.max_repeat_motifs: # If the count of most common SM is less than or equal to the number set up by the user
    #             if self.total_numcompounds <= self.library_size:
    #                 if len(written) < 1000000:
    #                     temp = ''.join([str(r) for r in item])
    #                     # m = Chem.MolFromSmiles(temp)
    #                     self.total_numcompounds += 1
    #                     written.append(temp)
    #                 else:
    #                     file_handler.write('\n'.join(written))
    #                     file_handler.close()
    #                     file_counter +=1
    #                     file_temp = self.smile_file_name + '_'+str(file_counter)+'.smiles'
    #                     file_handler = open(file_temp,'a+')
    #                     written = []
    #             else:
    #                 file_handler.write('\n'.join(written))  # write smiles in written list
    #                 break
    #     file_handler.close()


    def RS_check(self,smile,ringsize):
        '''
        Take in a smile string and the desired ring size. For many of the macrolides, it will be 14.
        The check is made on the atom with index 2 of the parsed molecule: if that atom is in a
        ring of the desired size, return the same smile. Otherwise return None.
        None is also returned when the smile cannot be parsed or has fewer than three atoms.
        '''
        m  = Chem.MolFromSmiles(smile)
        # MolFromSmiles gives None for an unparsable string; atom index 2 needs at least 3 atoms.
        if m is None or m.GetNumAtoms() < 3:
            return None
        if m.GetAtomWithIdx(2).IsInRingSize(ringsize):
            return smile
        return None

    def max_occurrence(self, template_list):
        '''
        Find the fragment that occurs most often in a list of fragments.
        Returns a tuple (fragment, number of occurrences), e.g. ('CCCC', 5).
        When several fragments share the highest count, the one seen first wins.
        '''
        tally = Counter(template_list)   # keeps the fragments in first-seen order
        return max(tally.items(), key=itemgetter(1))


    def enumerate_SM_stereocenters(self, ext):
        '''
        Enumerate both stereocenters of the structural motifs (SMs).

        ext : iterable of SM strings.

        Returns one flat list without duplicates: for every SM that carries a chirality mark,
        the SM itself followed by the variant with its first "@@" (or, if there is none, its
        first "@") flipped; after all of these, the plain SMs without any stereocenter, such
        as a ketone.
        The order follows the first appearance in ext, so the result - and with it the content
        of a size-limited library - is the same on every run.
        '''
        stereo_variants = []  # SMs with a stereocenter, each followed by its flipped variant
        plain = []            # SMs without any specified stereocenter
        # dict.fromkeys removes repeated SMs while keeping their order (a set would not).
        for motif in dict.fromkeys(ext):
            if "@" not in motif:
                plain.append(motif)
                continue
            if "@@" in motif:   # for clockwise
                flipped = motif.replace('@@','@',1)
            else:               # for anticlockwise
                flipped = motif.replace('@','@@',1)
            stereo_variants.extend((motif, flipped))

        # When ext already holds both variants of a motif, each of them generates the other;
        # drop those repeats so that no compound is produced twice.
        return list(dict.fromkeys(stereo_variants + plain))

    def replace_string_with_list(self,string, old, new):
        '''
        Cut "string" at every occurrence of "old" and put the list "new" into each gap.
        input  ---> string = '1[*]234[*]5[*]6', old = '[*]', new = ['a','b','c']
        output ---> [['1'], ['a', 'b', 'c'], ['234'], ['a', 'b', 'c'], ['5'], ['a', 'b', 'c'], ['6']]
        Every piece of the string is wrapped in its own one-item list; the very same "new"
        object (not a copy) is placed in each gap.
        '''
        frag = []
        for position, piece in enumerate(string.split(old)):
            if position:          # a gap precedes every piece except the first one
                frag.append(new)
            frag.append([piece])
        return frag


    def string_splitter(self, string, split_character):
        '''
        Cut "string" at every occurrence of "split_character" and keep the separators.
        input  ---> string = '1[*]234[*]5[*]6', split_character = '[*]'
        output ---> [['1'], ['[*]'], ['234'], ['[*]'], ['5'], ['[*]'], ['6']]
        Both the pieces and the separators are wrapped in one-item lists of their own; a
        separator at the start or the end of the string produces an empty piece [''].
        Close relative of self.replace_string_with_list.
        '''
        frag = []
        for position, piece in enumerate(string.split(split_character)):
            if position > 0:
                frag.append([split_character])   # a fresh list for every separator
            frag.append([piece])
        return frag

    def insert_SMs(self, template):
        '''
        Put the structural motifs into a template made with string_splitter, e.g.
        --->
        template = [['1'], ['[*]'], ['2'], ['[*sugar*]'], ['3'], ['[*]'], ['4']]
        Every ['[*]'] entry is replaced by self.extenders (the same list object each time).
        Sugar markers and all other entries are left alone.
        The template is modified in place and also returned.
        '''
        for position, entry in enumerate(template):
            if entry == ['[*]']:
                template[position] = self.extenders
        return template

    def generate_dummy_sugar_templates(self, template, minimal_sugars=1):
        '''
        Generate every arrangement of the dummies 'SUGARS' (position must hold a sugar) and
        'FULL_LIST' (position may hold a sugar or a hydroxyl) over the sugar positions.

        template       : template list; its ['[*sugar*]'] entries are the sugar positions.
        minimal_sugars : the least number of sugars the users want in each macrolide.
                         Default is one, i.e. there will be at least one sugar in each macrolide.
                         Negative values are treated as zero.

        Returns a list of tuples, each with exactly minimal_sugars 'SUGARS' entries and
        'FULL_LIST' in the remaining positions; the positions of these blocks matter.
        Arrangements with 'SUGARS' further to the front come first. When minimal_sugars exceeds
        the number of sugar positions, the single all-'SUGARS' tuple of length minimal_sugars
        is returned.
        '''
        num_sugars = template.count(['[*sugar*]'])
        required = max(minimal_sugars, 0)
        slots = max(num_sugars, required)
        sugar_lists_in_order = []
        # Choosing which slots hold 'SUGARS' yields every distinct arrangement exactly once,
        # without walking through all slots! permutations and discarding the repeats.
        for chosen in combinations(range(slots), required):
            sugar_slots = set(chosen)
            sugar_lists_in_order.append(
                tuple('SUGARS' if slot in sugar_slots else 'FULL_LIST' for slot in range(slots)))
        return sugar_lists_in_order


    def replace_SYMBOLsugars_with_dummies(self, sugar_dummy_order,template_with_sugarinlist):
        '''
        Put one arrangement of sugar dummies onto the sugar positions of a template.

        sugar_dummy_order         : e.g. ('SUGARS', 'FULL_LIST', 'FULL_LIST', 'FULL_LIST')
        template_with_sugarinlist : e.g. [['1'], [ext1,ext2,...], ['2'], ['[*sugar*]'], ['3'], [ext1,ext2,...], ['[*sugar*]'], ['4']]

        The n-th ['[*sugar*]'] entry is replaced by the n-th dummy. The work is done on a
        shallow copy, which is returned; the template passed in keeps its sugar markers.
        '''
        filled = list(template_with_sugarinlist)
        sugar_positions = [position for position, entry in enumerate(template_with_sugarinlist)
                           if entry == ['[*sugar*]']]
        for rank, position in enumerate(sugar_positions):
            filled[position] = sugar_dummy_order[rank]
        return filled

    def insert_sugars_to_dummies(self, template):
        '''
        Swap the sugar dummies of a template for the real lists, e.g.
        --->
        template = [['1'], [ext1,ext2,...], ['2'], 'SUGARS', ['3'], 'FULL_LIST']
        Every 'SUGARS' entry becomes self.sugars and every 'FULL_LIST' entry becomes
        self.full_list (the same list objects, not copies).
        The template is modified in place and also returned.
        '''
        for position, entry in enumerate(template):
            if entry == 'SUGARS':
                template[position] = self.sugars
            elif entry == 'FULL_LIST':
                template[position] = self.full_list
        return template