diff --git a/scripts/tylosin_splicer.py b/scripts/tylosin_splicer.py new file mode 100644 index 0000000..71f04d4 --- /dev/null +++ b/scripts/tylosin_splicer.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +""" +Tylosin Splicing System - Main Entry Point +""" + +def main(): + print("Hello from Tylosin Splicer") + +if __name__ == "__main__": + main() diff --git a/src/splicing/__init__.py b/src/splicing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/splicing/engine.py b/src/splicing/engine.py new file mode 100644 index 0000000..3a4e3a9 --- /dev/null +++ b/src/splicing/engine.py @@ -0,0 +1,79 @@ +from rdkit import Chem + +def splice_molecule(scaffold: Chem.Mol, fragment: Chem.Mol, position: int) -> Chem.Mol: + """ + Connects a scaffold and a fragment by replacing a specific dummy atom on the scaffold + and a dummy atom on the fragment with a single bond. + + Args: + scaffold: The scaffold molecule containing labeled dummy atoms (e.g., [1*]). + fragment: The fragment molecule containing a dummy atom (*). + position: The isotope number of the dummy atom on the scaffold to attach to. + + Returns: + Chem.Mol: The spliced molecule. + + Raises: + ValueError: If the specified dummy atom is not found on the scaffold + or if the fragment does not contain a dummy atom. + """ + # 1. Combine molecules + # Note: CombineMols preserves atom indices of mol1 (scaffold), and appends mol2 (fragment). + # Atoms 0 to N-1 are scaffold, N to N+M-1 are fragment. + combined = Chem.CombineMols(scaffold, fragment) + rw_mol = Chem.RWMol(combined) + + scaffold_atom_count = scaffold.GetNumAtoms() + total_atoms = rw_mol.GetNumAtoms() + + # 2. Find Scaffold Dummy + scaffold_dummy_idx = -1 + for i in range(scaffold_atom_count): + atom = rw_mol.GetAtomWithIdx(i) + if atom.GetAtomicNum() == 0 and atom.GetIsotope() == position: + scaffold_dummy_idx = i + break + + if scaffold_dummy_idx == -1: + raise ValueError(f"Scaffold dummy atom with isotope {position} not found") + + # 3. Find Fragment Dummy + # We search in the fragment part (indices >= scaffold_atom_count) + fragment_dummy_idx = -1 + for i in range(scaffold_atom_count, total_atoms): + atom = rw_mol.GetAtomWithIdx(i) + # We assume any dummy atom in the fragment is the attachment point. + # Usually fragment has isotope 0 on its dummy. + if atom.GetAtomicNum() == 0: + fragment_dummy_idx = i + break + + if fragment_dummy_idx == -1: + raise ValueError("Fragment does not contain a dummy atom") + + # 4. Identify Neighbors (Anchors) + scaffold_dummy = rw_mol.GetAtomWithIdx(scaffold_dummy_idx) + if scaffold_dummy.GetDegree() != 1: + raise ValueError(f"Scaffold dummy atom at index {scaffold_dummy_idx} must have exactly one neighbor") + scaffold_anchor_idx = scaffold_dummy.GetNeighbors()[0].GetIdx() + + fragment_dummy = rw_mol.GetAtomWithIdx(fragment_dummy_idx) + if fragment_dummy.GetDegree() != 1: + raise ValueError(f"Fragment dummy atom at index {fragment_dummy_idx} must have exactly one neighbor") + fragment_anchor_idx = fragment_dummy.GetNeighbors()[0].GetIdx() + + # 5. Add Bond + rw_mol.AddBond(scaffold_anchor_idx, fragment_anchor_idx, Chem.BondType.SINGLE) + + # 6. Remove Dummy Atoms + # Remove the higher index first to preserve the lower index + # We know fragment_dummy_idx > scaffold_dummy_idx because fragment atoms were appended. + # However, just to be safe, we sort them. + indices_to_remove = sorted([scaffold_dummy_idx, fragment_dummy_idx], reverse=True) + for idx in indices_to_remove: + rw_mol.RemoveAtom(idx) + + # 7. Sanitize + Chem.SanitizeMol(rw_mol) + + return rw_mol.GetMol() diff --git a/src/splicing/fragment_prep.py b/src/splicing/fragment_prep.py new file mode 100644 index 0000000..788851d --- /dev/null +++ b/src/splicing/fragment_prep.py @@ -0,0 +1,87 @@ +import random +from rdkit import Chem + +def activate_fragment(smiles: str, strategy: str = "smart") -> Chem.Mol: + """ + Convert a small molecule fragment into an attachable R-group by adding a dummy atom (*). + + Args: + smiles: SMILES string of the fragment. + strategy: 'smart' (prioritize heteroatoms) or 'random' (any atom with H). + + Returns: + Chem.Mol: The activated fragment with a dummy atom attached. + """ + mol = Chem.MolFromSmiles(smiles) + if mol is None: + raise ValueError(f"Invalid SMILES string: {smiles}") + + target_idx = -1 + + if strategy == "smart": + # Order of preference: Amine, Alcohol/Phenol, Thiol + # Amine: [N;!H0] - Nitrogen with at least one H + # Alcohol/Phenol: [O;H1] - Oxygen with 1 H (usually 2 bonds total) + # Thiol: [S;H1] + + smarts_patterns = [ + "[N;!H0]", # Primary/Secondary amine + "[O;H1]", # Alcohol/Phenol + "[S;H1]" # Thiol + ] + + for smarts in smarts_patterns: + pattern = Chem.MolFromSmarts(smarts) + if pattern: + matches = mol.GetSubstructMatches(pattern) + if matches: + # Pick the first match + target_idx = matches[0][0] + break + + if target_idx == -1: + # Fallback to random if no smart match found + strategy = "random" + + if strategy == "random": + # Find all atoms with at least one H + candidates = [] + carbon_candidates = [] + + for atom in mol.GetAtoms(): + # GetTotalNumHs includes implicit and explicit Hs + if atom.GetTotalNumHs() > 0: + candidates.append(atom.GetIdx()) + if atom.GetSymbol() == 'C': + carbon_candidates.append(atom.GetIdx()) + + if not candidates: + raise ValueError("No suitable atoms with hydrogens found to attach to.") + + # Prefer Carbon atoms if available + if carbon_candidates: + # Pick the first one for deterministic behavior + target_idx = carbon_candidates[0] + else: + target_idx = candidates[0] + + if target_idx == -1: + # Should be caught by the candidates check, but just in case + raise ValueError("Could not identify a target atom for activation.") + + # Perform attachment + rwmol = Chem.RWMol(mol) + + # Add dummy atom + dummy_idx = rwmol.AddAtom(Chem.Atom('*')) + + # Add bond to target atom + rwmol.AddBond(target_idx, dummy_idx, Chem.BondType.SINGLE) + + # Sanitize to fix implicit H counts and ensure validity + try: + Chem.SanitizeMol(rwmol) + except Exception as e: + raise ValueError(f"Failed to sanitize molecule after activation: {e}") + + return rwmol.GetMol() diff --git a/src/splicing/scaffold_prep.py b/src/splicing/scaffold_prep.py new file mode 100644 index 0000000..3ce30d9 --- /dev/null +++ b/src/splicing/scaffold_prep.py @@ -0,0 +1,123 @@ +from rdkit import Chem +from typing import List, Dict, Tuple, Set, Optional +from src.ring_numbering import assign_ring_numbering + +def get_subtree_indices(mol: Chem.Mol, start_atom_idx: int, forbidden_idx: int) -> Set[int]: + """ + Get all atom indices in the subtree starting at start_atom_idx, + moving away from forbidden_idx. + Used to identify side chain atoms to remove. + """ + subtree = set() + stack = [start_atom_idx] + + while stack: + current = stack.pop() + if current in subtree: + continue + subtree.add(current) + + atom = mol.GetAtomWithIdx(current) + for neighbor in atom.GetNeighbors(): + n_idx = neighbor.GetIdx() + # Traverse neighbors except the one leading back to the ring (forbidden) + # and those already visited + if n_idx != forbidden_idx and n_idx not in subtree: + stack.append(n_idx) + + return subtree + +def prepare_tylosin_scaffold(smiles: str, positions: List[int]) -> Tuple[Chem.Mol, Dict[int, int]]: + """ + Prepare the Tylosin scaffold by removing side chains at specified positions + and marking them with dummy atoms. + + Args: + smiles: SMILES string of the scaffold/molecule. + positions: List of ring positions (1-16) to prepare (add sockets). + + Returns: + Tuple of (Modified Molecule, Dict mapping position -> new_dummy_atom_idx) + The returned molecule has dummy atoms ('*') at the specified positions, + marked with Isotope = position number. + """ + mol = Chem.MolFromSmiles(smiles) + if not mol: + raise ValueError(f"Invalid SMILES: {smiles}") + + # 1. Ring numbering to identify target atoms + ring_map = assign_ring_numbering(mol) # atom_idx -> ring_pos + if not ring_map: + raise ValueError("Could not assign ring numbering. Is this a 16-membered lactone?") + + # Reverse map for easy lookup: ring_pos -> atom_idx + pos_to_atom = {v: k for k, v in ring_map.items()} + + atoms_to_remove = set() + dummies_to_add = [] # List of (ring_atom_idx, position) + + # 2. Identify edits + for pos in positions: + if pos not in pos_to_atom: + raise ValueError(f"Position {pos} not found in ring numbering.") + + ring_atom_idx = pos_to_atom[pos] + ring_atom = mol.GetAtomWithIdx(ring_atom_idx) + + # Identify non-ring neighbors (side chains) + # Note: neighbors in ring have indices in ring_map + side_chain_neighbors = [] + for n in ring_atom.GetNeighbors(): + if n.GetIdx() not in ring_map: + side_chain_neighbors.append(n.GetIdx()) + + # If side chains exist, mark them for removal + if side_chain_neighbors: + for sc_idx in side_chain_neighbors: + subtree = get_subtree_indices(mol, sc_idx, forbidden_idx=ring_atom_idx) + atoms_to_remove.update(subtree) + + # Plan to add dummy at this ring atom + dummies_to_add.append((ring_atom_idx, pos)) + + # 3. Apply edits using RWMol + rwmol = Chem.RWMol(mol) + + # Step A: Add dummy atoms + # We add them before deletion to use stable ring indices. + # Note: Adding atoms does not change existing indices. + + for ring_idx, pos in dummies_to_add: + # Create dummy atom + dummy = Chem.Atom('*') + dummy.SetIsotope(pos) # Mark with position + + new_idx = rwmol.AddAtom(dummy) + + # Add bond to ring atom + rwmol.AddBond(ring_idx, new_idx, Chem.BondType.SINGLE) + + # Step B: Remove side chain atoms + # Sort descending to preserve lower indices during deletion + sorted_remove = sorted(list(atoms_to_remove), reverse=True) + for idx in sorted_remove: + rwmol.RemoveAtom(idx) + + # 4. Finalize + mol_final = rwmol.GetMol() + + try: + Chem.SanitizeMol(mol_final) + except Exception: + # Sometime dummies trigger sanitization errors, but usually partial sanitization works. + # We'll ignore strict sanitization errors for the scaffold as it has dummies. + pass + + # 5. Build result map (position -> atom_index in new mol) + # Since indices shifted, we find dummies by their isotope markers. + final_dummy_map = {} + for atom in mol_final.GetAtoms(): + if atom.GetSymbol() == '*' and atom.GetIsotope() in positions: + final_dummy_map[atom.GetIsotope()] = atom.GetIdx() + + return mol_final, final_dummy_map diff --git a/tests/test_env_integration.py b/tests/test_env_integration.py new file mode 100644 index 0000000..8ed3779 --- /dev/null +++ b/tests/test_env_integration.py @@ -0,0 +1,39 @@ +import sys +import os +from pathlib import Path + +# Add SIME to path +SIME_PATH = "/home/zly/project/SIME" +if SIME_PATH not in sys.path: + sys.path.append(SIME_PATH) + +# Add project root to path so we can import 'src' +PROJECT_ROOT = str(Path(__file__).parent.parent) +if PROJECT_ROOT not in sys.path: + sys.path.append(PROJECT_ROOT) + +def test_imports(): + """Verify that we can import from both local project and SIME.""" + print(f"sys.path: {sys.path}") + + # 1. Test local import from src + try: + # Correct function name based on file inspection + from src.ring_numbering import assign_ring_numbering + assert callable(assign_ring_numbering) + print("Successfully imported src.ring_numbering.assign_ring_numbering") + except ImportError as e: + print(f"Failed to import src.ring_numbering: {e}") + raise + + # 2. Test SIME import + try: + from utils.mole_predictor import ParallelBroadSpectrumPredictor + assert ParallelBroadSpectrumPredictor is not None + print("Successfully imported ParallelBroadSpectrumPredictor from utils.mole_predictor") + except ImportError as e: + print(f"Failed to import from SIME: {e}") + raise + +if __name__ == "__main__": + test_imports() diff --git a/tests/test_fragment_prep.py b/tests/test_fragment_prep.py new file mode 100644 index 0000000..5245f9c --- /dev/null +++ b/tests/test_fragment_prep.py @@ -0,0 +1,95 @@ +import pytest +from rdkit import Chem +from src.splicing.fragment_prep import activate_fragment + +def test_activate_smart_ethanol(): + """Test 'smart' activation on Ethanol (CCO). Should attach to Oxygen.""" + smiles = "CCO" + mol = activate_fragment(smiles, strategy="smart") + + # Check if we have a dummy atom + assert mol is not None + assert mol.GetNumAtoms() == 4 # C, C, O, * + + # Check if the dummy atom is attached to Oxygen + # Find the dummy atom + dummy_atom = None + for atom in mol.GetAtoms(): + if atom.GetSymbol() == '*': + dummy_atom = atom + break + + assert dummy_atom is not None + + # Check neighbors of dummy atom + neighbors = dummy_atom.GetNeighbors() + assert len(neighbors) == 1 + assert neighbors[0].GetSymbol() == 'O' + + # Check output SMILES format + out_smiles = Chem.MolToSmiles(mol) + assert '*' in out_smiles + +def test_activate_smart_amine(): + """Test 'smart' activation on Ethylamine (CCN). Should attach to Nitrogen.""" + smiles = "CCN" + mol = activate_fragment(smiles, strategy="smart") + + assert mol is not None + + # Find the dummy atom + dummy_atom = None + for atom in mol.GetAtoms(): + if atom.GetSymbol() == '*': + dummy_atom = atom + break + + assert dummy_atom is not None + neighbors = dummy_atom.GetNeighbors() + assert neighbors[0].GetSymbol() == 'N' + +def test_activate_random_pentane(): + """Test 'random' activation on Pentane (CCCCC). Should attach to a Carbon.""" + smiles = "CCCCC" + # Seed is not easily passed to the function unless we add it to the signature or fix it inside. + # For this test, any Carbon is fine. + mol = activate_fragment(smiles, strategy="random") + + assert mol is not None + assert mol.GetNumAtoms() == 6 # 5 C + 1 * + + dummy_atom = None + for atom in mol.GetAtoms(): + if atom.GetSymbol() == '*': + dummy_atom = atom + break + + assert dummy_atom is not None + neighbors = dummy_atom.GetNeighbors() + assert neighbors[0].GetSymbol() == 'C' + +def test_activate_smart_fallback(): + """Test 'smart' fallback when no heteroatoms are found (e.g. Propane).""" + smiles = "CCC" + # Should fall back to finding a terminal carbon or random + # The requirement says "fall back to a terminal Carbon" or random. + # Let's assume the implementation picks a terminal carbon if possible, or just behaves like random on C. + mol = activate_fragment(smiles, strategy="smart") + + assert mol is not None + dummy_atom = None + for atom in mol.GetAtoms(): + if atom.GetSymbol() == '*': + dummy_atom = atom + break + + assert dummy_atom is not None + neighbor = dummy_atom.GetNeighbors()[0] + assert neighbor.GetSymbol() == 'C' + # Verify it's a valid molecule + assert Chem.SanitizeMol(mol) == Chem.SanitizeFlags.SANITIZE_NONE + +def test_invalid_smiles(): + with pytest.raises(ValueError): + activate_fragment("NotASmiles", strategy="smart") + diff --git a/tests/test_scaffold_prep.py b/tests/test_scaffold_prep.py new file mode 100644 index 0000000..d1d5a8d --- /dev/null +++ b/tests/test_scaffold_prep.py @@ -0,0 +1,84 @@ +import pytest +from rdkit import Chem +from src.splicing.scaffold_prep import prepare_tylosin_scaffold +from src.ring_numbering import assign_ring_numbering + +def test_prepare_tylosin_scaffold(): + # Construct a 16-membered lactone with side chains + # Numbering logic (assumed based on implementation): + # 1: C=O + # 2-6: CH2 + # 7: CH(CH3) <- Methyl side chain + # 8-14: CH2 + # 15: CH(CC) <- Ethyl side chain + # 16: O + + # SMILES: + # O=C1 (pos 1) + # CCCCC (pos 2-6) + # C(C) (pos 7, with Methyl) + # CCCCCCC (pos 8-14) + # C(CC) (pos 15, with Ethyl) + # O1 (pos 16) + + smiles = "O=C1CCCCC(C)CCCCCCCCC(CC)O1" + + # Verify initial assumption about numbering + mol = Chem.MolFromSmiles(smiles) + numbering = assign_ring_numbering(mol) + + # Find atom indices for pos 7 and 15 to ensure our SMILES construction is correct for the test + pos_map = {v: k for k, v in numbering.items()} + assert 7 in pos_map, "Position 7 not found in ring" + assert 15 in pos_map, "Position 15 not found in ring" + assert 5 in pos_map, "Position 5 not found in ring" + + atom7 = mol.GetAtomWithIdx(pos_map[7]) + atom15 = mol.GetAtomWithIdx(pos_map[15]) + atom5 = mol.GetAtomWithIdx(pos_map[5]) + + # Check side chains exist + # Atom 7 should have 3 neighbors (2 ring, 1 methyl) + assert len(atom7.GetNeighbors()) == 3 + # Atom 15 should have 3 neighbors (2 ring, 1 ethyl) + assert len(atom15.GetNeighbors()) == 3 + # Atom 5 should have 2 neighbors (2 ring, 2 implicit H) + assert len(atom5.GetNeighbors()) == 2 + + # Execute scaffold prep + target_positions = [5, 7, 15] + res_mol, dummy_map = prepare_tylosin_scaffold(smiles, target_positions) + + assert res_mol is not None + assert len(dummy_map) == 3 + + # Verify dummies + for pos in target_positions: + assert pos in dummy_map + dummy_idx = dummy_map[pos] + dummy_atom = res_mol.GetAtomWithIdx(dummy_idx) + assert dummy_atom.GetSymbol() == "*" + assert dummy_atom.GetIsotope() == pos + + # Check that dummy is connected to the correct ring position + neighbors = dummy_atom.GetNeighbors() + assert len(neighbors) == 1 + + # Verify side chains removed + # New atom counts. + # Original: 16 (ring) + 1 (O=) + 1 (Me) + 2 (Et) = 20 heavy atoms. + # Removed: Me (1), Et (2). Total -3. + # Added: 3 dummies. Total +3. + # Net: 20. + assert res_mol.GetNumAtoms() == 20 + + # Check that the specific side chains are gone. + # Count carbons. + # Original C count: 1 (C=O) + 14 (CH2/CH) + 1(Me) + 2(Et) = 18 C. + # New C count: 1 (C=O) + 14 (Ring C) = 15 C. + # Dummies are *. O are O. + c_count = sum(1 for a in res_mol.GetAtoms() if a.GetSymbol() == 'C') + assert c_count == 15, f"Expected 15 Carbons, found {c_count}" + + dummy_count = sum(1 for a in res_mol.GetAtoms() if a.GetSymbol() == '*') + assert dummy_count == 3 diff --git a/tests/test_splicing_engine.py b/tests/test_splicing_engine.py new file mode 100644 index 0000000..e3a6edf --- /dev/null +++ b/tests/test_splicing_engine.py @@ -0,0 +1,77 @@ +import pytest +from rdkit import Chem +from src.splicing.engine import splice_molecule + +def test_splice_benzene_methyl(): + """ + Test splicing a benzene scaffold (isotope 1) with a methyl fragment. + Scaffold: c1ccccc1[1*] (Phenyl radical-ish dummy) + Fragment: C* (Methyl radical-ish dummy) + Result: Cc1ccccc1 (Toluene) + """ + scaffold = Chem.MolFromSmiles("c1ccccc1[1*]") + fragment = Chem.MolFromSmiles("C*") + + assert scaffold is not None + assert fragment is not None + + product = splice_molecule(scaffold, fragment, position=1) + + # Expected result: Toluene + expected_smiles = "Cc1ccccc1" + expected_mol = Chem.MolFromSmiles(expected_smiles) + expected_canonical = Chem.MolToSmiles(expected_mol, isomericSmiles=True) + + product_canonical = Chem.MolToSmiles(product, isomericSmiles=True) + + assert product_canonical == expected_canonical + +def test_splice_missing_isotope(): + """Test that error is raised if the requested position is not found on scaffold.""" + scaffold = Chem.MolFromSmiles("c1ccccc1[2*]") # Isotope 2 + fragment = Chem.MolFromSmiles("C*") + + with pytest.raises(ValueError, match="Scaffold dummy atom with isotope 1 not found"): + splice_molecule(scaffold, fragment, position=1) + +def test_splice_no_fragment_dummy(): + """Test that error is raised if fragment has no dummy atom.""" + scaffold = Chem.MolFromSmiles("c1ccccc1[1*]") + fragment = Chem.MolFromSmiles("C") # Methane, no dummy + + with pytest.raises(ValueError, match="Fragment does not contain a dummy atom"): + splice_molecule(scaffold, fragment, position=1) + +def test_complex_splicing(): + """ + Test splicing with more complex structures. + Scaffold: Pyridine derivative n1cccc1CC[1*] + Fragment: Cyclopropane C1CC1* + Result: n1cccc1CCC1CC1 + """ + scaffold = Chem.MolFromSmiles("n1cccc1CC[1*]") + fragment = Chem.MolFromSmiles("*C1CC1") + + product = splice_molecule(scaffold, fragment, position=1) + + expected = Chem.MolFromSmiles("n1cccc1CCC1CC1") + + assert Chem.MolToSmiles(product) == Chem.MolToSmiles(expected) + +def test_scaffold_with_multiple_different_dummies(): + """ + Test splicing when scaffold has multiple dummies with different isotopes. + Scaffold: [1*]c1ccccc1[2*] + Fragment: C* + Target: Splicing at 1 should leave [2*] intact. + """ + scaffold = Chem.MolFromSmiles("[1*]c1ccccc1[2*]") + fragment = Chem.MolFromSmiles("C*") + + # Splice at 1 + product = splice_molecule(scaffold, fragment, position=1) + + # Expected: Cc1ccccc1[2*] + expected = Chem.MolFromSmiles("Cc1ccccc1[2*]") + + assert Chem.MolToSmiles(product) == Chem.MolToSmiles(expected)