{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# RDKit Substructure Matching with Multiprocessing\n", "\n", "This notebook performs parallel substructure matching on all extracted SDF files using SMARTS patterns with 220 processes." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pathlib import Path\n", "from rdkit import Chem\n", "from rdkit.Chem import SDMolSupplier, AllChem\n", "from joblib import Parallel, delayed\n", "import multiprocessing\n", "from tqdm import tqdm\n", "import time\n", "import warnings\n", "# NOTE(review): this also hides RDKit sanitization warnings; re-enable when debugging bad SDF records\n", "warnings.filterwarnings('ignore')\n", "\n", "# Set up paths\n", "sdf_dir = Path('../extracted_sdf_files')\n", "results_dir = Path('../matching_results')\n", "results_dir.mkdir(exist_ok=True)\n", "\n", "print(f\"SDF directory: {sdf_dir}\")\n", "print(f\"Results directory: {results_dir}\")\n", "\n", "# Load the SDF file list\n", "sdf_df = pd.read_csv(sdf_dir / 'sdf_file_list.csv')\n", "print(f\"Found {len(sdf_df)} SDF files to process\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Define SMARTS patterns for substructure matching\n", "smarts_patterns = {\n", "    'benzene_ring': 'c1ccccc1',\n", "    'pyridine': 'c1ccncc1',\n", "    # NOTE(review): 'C(=O)O' also matches esters; use 'C(=O)[OX2H1]' to restrict to free acids\n", "    'carboxylic_acid': 'C(=O)O',\n", "    'alcohol': '[OX2H]',\n", "    'amine': '[NX3;H2,H1;!$(NC=O)]',\n", "    'amide': 'C(=O)N',\n", "    'ester': 'C(=O)OC',\n", "    # NOTE(review): 'C(=O)C' also matches acids/esters/amides that carry an alpha carbon\n", "    'ketone': 'C(=O)C',\n", "    # Fixed: was 'C(=O)H'. A bare H atom in SMARTS matches only explicit\n", "    # hydrogens, so the old pattern essentially never hit sanitized\n", "    # (implicit-H) molecules; [CX3H1](=O) matches the aldehyde carbon directly.\n", "    'aldehyde': '[CX3H1](=O)',\n", "    'nitro': '[N+](=O)[O-]',\n", "    'halogen': '[Cl,Br,F,I]',\n", "    'sulfonamide': 'S(=O)(=O)N',\n", "    'heterocycle': '[n,o,s]',\n", "    'aromatic_ring': '[a]',\n", "    'alkene': '[C]=[C]',\n", "    'alkyne': '[C]#[C]',\n", "    'ether': '[OD2]([C])[C]',\n", "    'phenol': 'c1ccc(cc1)[OX2H]'\n", "}\n", "\n", "# Compile SMARTS patterns once so every worker reuses the same query mols\n", "compiled_patterns = {}\n", "for name, smarts in smarts_patterns.items():\n", "    try:\n", "        pattern = Chem.MolFromSmarts(smarts)\n", "        if pattern is not None:\n", "            
compiled_patterns[name] = pattern\n", "            print(f\"✓ Compiled SMARTS for {name}: {smarts}\")\n", "        else:\n", "            print(f\"✗ Failed to compile SMARTS for {name}: {smarts}\")\n", "    except Exception as e:\n", "        print(f\"✗ Error compiling SMARTS for {name}: {e}\")\n", "\n", "print(f\"\\nSuccessfully compiled {len(compiled_patterns)} SMARTS patterns\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def match_mol_to_patterns(mol, patterns_dict):\n", "    \"\"\"Match a single molecule against all SMARTS patterns.\n", "\n", "    Returns {'smiles': canonical SMILES, 'matches': {pattern_name: bool}}\n", "    or None when mol is None.\n", "    \"\"\"\n", "    if mol is None:\n", "        return None\n", "\n", "    # Always record the canonical SMILES. The old code stored the _Name\n", "    # property when present, which is an identifier, not a SMILES string,\n", "    # so the 'smiles' column was silently polluted with names.\n", "    mol_smiles = Chem.MolToSmiles(mol)\n", "\n", "    matches = {}\n", "    for pattern_name, pattern in patterns_dict.items():\n", "        try:\n", "            matches[pattern_name] = bool(mol.HasSubstructMatch(pattern))\n", "        except Exception:\n", "            # A pattern that errors out is recorded as a non-match\n", "            matches[pattern_name] = False\n", "\n", "    return {\n", "        'smiles': mol_smiles,\n", "        'matches': matches\n", "    }" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def process_sdf_file(sdf_path, patterns_dict, max_molecules=None):\n", "    \"\"\"Process a single SDF file and return matching results.\n", "\n", "    max_molecules optionally caps the number of parseable molecules read\n", "    (useful for quick tests). Relies on the module-level sdf_dir to report\n", "    file paths relative to the SDF root.\n", "    \"\"\"\n", "    try:\n", "        suppl = SDMolSupplier(str(sdf_path), sanitize=True)\n", "        results = []\n", "        # Stream the supplier instead of materializing every molecule first\n", "        for mol in suppl:\n", "            if mol is None:\n", "                continue\n", "            if max_molecules and len(results) >= max_molecules:\n", "                break\n", "            result = match_mol_to_patterns(mol, patterns_dict)\n", "            if result:\n", "                result['sdf_file'] = str(sdf_path.relative_to(sdf_dir))\n", "                results.append(result)\n", "        return results\n", "    except Exception as e:\n", "        # Keep batch processing alive; report and return an empty result set\n", "        print(f\"Error processing {sdf_path}: {e}\")\n", "        return []" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Test with a small subset first\n",
"print(\"Testing with a small subset of files...\")\n", "test_files = sdf_df.head(3)['file_path'].tolist()\n", "test_files = [Path(f) for f in test_files]\n", "\n", "start_time = time.time()\n", "test_results = []\n", "\n", "for sdf_file in test_files:\n", " print(f\"Processing {sdf_file.name}...\")\n", " results = process_sdf_file(sdf_file, compiled_patterns, max_molecules=100)\n", " test_results.extend(results)\n", " print(f\" Found {len(results)} molecules\")\n", "\n", "test_time = time.time() - start_time\n", "print(f\"\\nTest completed in {test_time:.2f} seconds\")\n", "print(f\"Processed {len(test_results)} molecules\")\n", "\n", "# Display some sample results\n", "if test_results:\n", " sample_result = test_results[0]\n", " print(f\"\\nSample result:\")\n", " print(f\" SMILES: {sample_result['smiles']}\")\n", " print(f\" Matches: {sample_result['matches']}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Set up multiprocessing parameters\n", "N_PROCESSES = 220 # As requested -- NOTE(review): far above typical cpu_count(); loky will oversubscribe cores and add scheduling overhead\n", "N_FILES_PER_BATCH = 10 # Process files in batches to manage memory\n", "\n", "print(f\"Setting up multiprocessing with {N_PROCESSES} processes\")\n", "print(f\"CPU cores available: {multiprocessing.cpu_count()}\")\n", "\n", "# Prepare file batches\n", "all_files = [Path(f) for f in sdf_df['file_path'].tolist()]\n", "file_batches = [all_files[i:i + N_FILES_PER_BATCH] for i in range(0, len(all_files), N_FILES_PER_BATCH)]\n", "\n", "print(f\"Processing {len(all_files)} files in {len(file_batches)} batches\")\n", "print(f\"Average files per batch: {len(all_files) / len(file_batches):.1f}\")  # assumes at least one file was listed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def process_file_batch(file_batch, patterns_dict, batch_id):\n", " \"\"\"Process a batch of SDF files and return the combined per-molecule results.\"\"\"\n", " batch_results = []\n", " \n", " for sdf_file in file_batch:\n", " try:\n", " results = process_sdf_file(sdf_file, patterns_dict)\n", " 
batch_results.extend(results)\n", " except Exception as e:\n", " print(f\"Error in batch {batch_id} processing {sdf_file}: {e}\")\n", " \n", " return batch_results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Run parallel processing\n", "print(\"Starting parallel substructure matching...\")\n", "start_time = time.time()\n", "\n", "all_results = []\n", "processed_batches = 0\n", "\n", "# Process batches with progress bar\n", "# NOTE(review): Parallel(...)(...) materializes the full result list before\n", "# tqdm ever iterates, so the bar and the 'intermediate' saves below only run\n", "# after ALL batches have finished; joblib's return_as='generator' (>= 1.3)\n", "# would give true streaming progress and genuine intermediate checkpoints.\n", "for batch_results in tqdm(\n", " Parallel(n_jobs=N_PROCESSES, backend='loky', verbose=1)(\n", " delayed(process_file_batch)(batch, compiled_patterns, i) \n", " for i, batch in enumerate(file_batches)\n", " ),\n", " total=len(file_batches),\n", " desc=\"Processing batches\"\n", "):\n", " all_results.extend(batch_results)\n", " processed_batches += 1\n", " \n", " # Save intermediate results every 10 batches\n", " if processed_batches % 10 == 0:\n", " intermediate_df = pd.DataFrame(all_results)\n", " intermediate_file = results_dir / f'intermediate_results_batch_{processed_batches}.csv'\n", " intermediate_df.to_csv(intermediate_file, index=False)\n", " print(f\"Saved intermediate results: {len(all_results)} molecules processed\")\n", "\n", "total_time = time.time() - start_time\n", "print(f\"\\nProcessing completed in {total_time:.2f} seconds\")\n", "print(f\"Total molecules processed: {len(all_results)}\")\n", "print(f\"Average processing speed: {len(all_results) / total_time:.1f} molecules/second\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Convert results to DataFrame and save\n", "if all_results:\n", " # Flatten the nested match dicts into one row per molecule\n", " flattened_results = []\n", " for result in all_results:\n", " row = {\n", " 'smiles': result['smiles'],\n", " 'sdf_file': result['sdf_file']\n", " }\n", " row.update(result['matches'])\n", " flattened_results.append(row)\n", " \n", " results_df = pd.DataFrame(flattened_results)\n", " \n", " # Save complete 
results\n", "    results_file = results_dir / 'complete_matching_results.csv'\n", "    results_df.to_csv(results_file, index=False)\n", "    print(f\"Saved complete results to: {results_file}\")\n", "\n", "    # Generate per-pattern summary statistics (hit count and % of molecules)\n", "    summary_stats = {}\n", "    for pattern_name in compiled_patterns.keys():\n", "        count = results_df[pattern_name].sum()\n", "        percentage = (count / len(results_df)) * 100\n", "        summary_stats[pattern_name] = {\n", "            'count': int(count),\n", "            'percentage': percentage\n", "        }\n", "\n", "    summary_df = pd.DataFrame(summary_stats).T\n", "    summary_file = results_dir / 'matching_summary.csv'\n", "    summary_df.to_csv(summary_file)\n", "\n", "    print(f\"\\nMatching Summary:\")\n", "    print(summary_df)\n", "\n", "    # Save molecules with specific patterns\n", "    for pattern_name in compiled_patterns.keys():\n", "        # Boolean column -- index with it directly instead of comparing '== True'\n", "        matching_mols = results_df[results_df[pattern_name]]\n", "        if len(matching_mols) > 0:\n", "            pattern_file = results_dir / f'molecules_with_{pattern_name}.csv'\n", "            matching_mols.to_csv(pattern_file, index=False)\n", "            print(f\"Saved {len(matching_mols)} molecules with {pattern_name} pattern\")\n", "else:\n", "    print(\"No results to save\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Performance analysis\n", "print(\"\\nPerformance Analysis:\")\n", "print(f\"Total processing time: {total_time:.2f} seconds\")\n", "print(f\"Total molecules processed: {len(all_results)}\")\n", "print(f\"Number of processes used: {N_PROCESSES}\")\n", "print(f\"Average molecules per second: {len(all_results) / total_time:.1f}\")\n", "print(f\"Average molecules per process: {len(all_results) / N_PROCESSES:.1f}\")\n", "\n", "# File processing statistics\n", "files_processed = len({r['sdf_file'] for r in all_results})\n", "print(f\"Files successfully processed: {files_processed}/{len(all_files)}\")\n", "# Guard against an empty run to avoid ZeroDivisionError\n", "if files_processed:\n", "    print(f\"Average molecules per file: {len(all_results) / files_processed:.1f}\")\n", "\n", "# 
Memory usage estimation\n", "if all_results:\n", " import sys\n", " result_size = sys.getsizeof(all_results) / (1024**2)  # shallow size of the list object only; the per-molecule dicts are not counted\n", " print(f\"Estimated memory usage for results: {result_size:.2f} MB\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Create a quick visualization of pattern frequencies\n", "try:\n", " import matplotlib.pyplot as plt\n", " import seaborn as sns  # NOTE(review): imported but never used in this cell\n", " \n", " plt.figure(figsize=(12, 8))\n", " pattern_counts = [results_df[pattern].sum() for pattern in compiled_patterns.keys()]\n", " pattern_names = list(compiled_patterns.keys())\n", " \n", " # Create bar plot\n", " plt.bar(range(len(pattern_names)), pattern_counts)\n", " plt.xticks(range(len(pattern_names)), pattern_names, rotation=45, ha='right')\n", " plt.ylabel('Number of Molecules')\n", " plt.title('Substructure Pattern Frequencies')\n", " plt.tight_layout()\n", " \n", " # Save plot\n", " plot_file = results_dir / 'pattern_frequencies.png'\n", " plt.savefig(plot_file, dpi=300, bbox_inches='tight')\n", " plt.show()\n", " \n", " print(f\"Saved visualization to: {plot_file}\")\n", " \n", "except ImportError:\n", " # Text fallback when matplotlib/seaborn are not installed\n", " print(\"Matplotlib not available for visualization\")\n", " print(\"Pattern counts:\")\n", " for pattern_name in compiled_patterns.keys():\n", " count = results_df[pattern_name].sum()\n", " print(f\" {pattern_name}: {int(count)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Summary\n", "\n", "This notebook successfully:\n", "1. ✅ Set up RDKit and compiled SMARTS patterns\n", "2. ✅ Implemented parallel processing with 220 processes\n", "3. ✅ Processed all SDF files for substructure matching\n", "4. ✅ Generated comprehensive results and statistics\n", "5. 
✅ Saved results in multiple formats for further analysis\n", "\n", "The results are saved in the `matching_results` directory with:\n", "- Complete matching results CSV\n", "- Summary statistics\n", "- Individual pattern matches\n", "- Performance analysis\n", "- Visualization (if matplotlib available)" ] } ], "metadata": { "kernelspec": { "display_name": "default", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.0" } }, "nbformat": 4, "nbformat_minor": 4 }