Initial commit
This commit is contained in:
413
notebooks/02_rdkit_substructure_matching.ipynb
Normal file
413
notebooks/02_rdkit_substructure_matching.ipynb
Normal file
@@ -0,0 +1,413 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# RDKit Substructure Matching with Multiprocessing\n",
|
||||
"\n",
|
||||
"This notebook performs parallel substructure matching on all extracted SDF files using SMARTS patterns with 220 processes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from pathlib import Path\n",
|
||||
"from rdkit import Chem\n",
|
||||
"from rdkit.Chem import SDMolSupplier, AllChem\n",
|
||||
"from joblib import Parallel, delayed\n",
|
||||
"import multiprocessing\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import time\n",
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')\n",
|
||||
"\n",
|
||||
"# Set up paths\n",
|
||||
"sdf_dir = Path('../extracted_sdf_files')\n",
|
||||
"results_dir = Path('../matching_results')\n",
|
||||
"results_dir.mkdir(exist_ok=True)\n",
|
||||
"\n",
|
||||
"print(f\"SDF directory: {sdf_dir}\")\n",
|
||||
"print(f\"Results directory: {results_dir}\")\n",
|
||||
"\n",
|
||||
"# Load the SDF file list\n",
|
||||
"sdf_df = pd.read_csv(sdf_dir / 'sdf_file_list.csv')\n",
|
||||
"print(f\"Found {len(sdf_df)} SDF files to process\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define SMARTS patterns for substructure matching\n",
|
||||
"smarts_patterns = {\n",
|
||||
" 'benzene_ring': 'c1ccccc1',\n",
|
||||
" 'pyridine': 'c1ccncc1', \n",
|
||||
" 'carboxylic_acid': 'C(=O)O',\n",
|
||||
" 'alcohol': '[OX2H]',\n",
|
||||
" 'amine': '[NX3;H2,H1;!$(NC=O)]',\n",
|
||||
" 'amide': 'C(=O)N',\n",
|
||||
" 'ester': 'C(=O)OC',\n",
|
||||
" 'ketone': 'C(=O)C',\n",
|
||||
" 'aldehyde': 'C(=O)H',\n",
|
||||
" 'nitro': '[N+](=O)[O-]',\n",
|
||||
" 'halogen': '[Cl,Br,F,I]',\n",
|
||||
" 'sulfonamide': 'S(=O)(=O)N',\n",
|
||||
" 'heterocycle': '[n,o,s]',\n",
|
||||
" 'aromatic_ring': '[a]',\n",
|
||||
" 'alkene': '[C]=[C]',\n",
|
||||
" 'alkyne': '[C]#[C]',\n",
|
||||
" 'ether': '[OD2]([C])[C]',\n",
|
||||
" 'phenol': 'c1ccc(cc1)[OX2H]'\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# Compile SMARTS patterns\n",
|
||||
"compiled_patterns = {}\n",
|
||||
"for name, smarts in smarts_patterns.items():\n",
|
||||
" try:\n",
|
||||
" pattern = Chem.MolFromSmarts(smarts)\n",
|
||||
" if pattern is not None:\n",
|
||||
" compiled_patterns[name] = pattern\n",
|
||||
" print(f\"✓ Compiled SMARTS for {name}: {smarts}\")\n",
|
||||
" else:\n",
|
||||
" print(f\"✗ Failed to compile SMARTS for {name}: {smarts}\")\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"✗ Error compiling SMARTS for {name}: {e}\")\n",
|
||||
"\n",
|
||||
"print(f\"\\nSuccessfully compiled {len(compiled_patterns)} SMARTS patterns\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def match_mol_to_patterns(mol, patterns_dict):\n",
|
||||
" \"\"\"Match a single molecule against all SMARTS patterns.\"\"\"\n",
|
||||
" if mol is None:\n",
|
||||
" return None\n",
|
||||
" \n",
|
||||
" matches = {}\n",
|
||||
" mol_smiles = Chem.MolToSmiles(mol) if mol.HasProp('_Name') == False else mol.GetProp('_Name')\n",
|
||||
" \n",
|
||||
" for pattern_name, pattern in patterns_dict.items():\n",
|
||||
" try:\n",
|
||||
" if mol.HasSubstructMatch(pattern):\n",
|
||||
" matches[pattern_name] = True\n",
|
||||
" else:\n",
|
||||
" matches[pattern_name] = False\n",
|
||||
" except Exception as e:\n",
|
||||
" matches[pattern_name] = False\n",
|
||||
" \n",
|
||||
" return {\n",
|
||||
" 'smiles': mol_smiles,\n",
|
||||
" 'matches': matches\n",
|
||||
" }"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def process_sdf_file(sdf_path, patterns_dict, max_molecules=None):\n",
|
||||
" \"\"\"Process a single SDF file and return matching results.\"\"\"\n",
|
||||
" try:\n",
|
||||
" suppl = SDMolSupplier(str(sdf_path), sanitize=True)\n",
|
||||
" molecules = [mol for mol in suppl if mol is not None]\n",
|
||||
" \n",
|
||||
" if max_molecules:\n",
|
||||
" molecules = molecules[:max_molecules]\n",
|
||||
" \n",
|
||||
" results = []\n",
|
||||
" for mol in molecules:\n",
|
||||
" result = match_mol_to_patterns(mol, patterns_dict)\n",
|
||||
" if result:\n",
|
||||
" result['sdf_file'] = str(sdf_path.relative_to(sdf_dir))\n",
|
||||
" results.append(result)\n",
|
||||
" \n",
|
||||
" return results\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error processing {sdf_path}: {e}\")\n",
|
||||
" return []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Test with a small subset first\n",
|
||||
"print(\"Testing with a small subset of files...\")\n",
|
||||
"test_files = sdf_df.head(3)['file_path'].tolist()\n",
|
||||
"test_files = [Path(f) for f in test_files]\n",
|
||||
"\n",
|
||||
"start_time = time.time()\n",
|
||||
"test_results = []\n",
|
||||
"\n",
|
||||
"for sdf_file in test_files:\n",
|
||||
" print(f\"Processing {sdf_file.name}...\")\n",
|
||||
" results = process_sdf_file(sdf_file, compiled_patterns, max_molecules=100)\n",
|
||||
" test_results.extend(results)\n",
|
||||
" print(f\" Found {len(results)} molecules\")\n",
|
||||
"\n",
|
||||
"test_time = time.time() - start_time\n",
|
||||
"print(f\"\\nTest completed in {test_time:.2f} seconds\")\n",
|
||||
"print(f\"Processed {len(test_results)} molecules\")\n",
|
||||
"\n",
|
||||
"# Display some sample results\n",
|
||||
"if test_results:\n",
|
||||
" sample_result = test_results[0]\n",
|
||||
" print(f\"\\nSample result:\")\n",
|
||||
" print(f\" SMILES: {sample_result['smiles']}\")\n",
|
||||
" print(f\" Matches: {sample_result['matches']}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Set up multiprocessing parameters\n",
|
||||
"N_PROCESSES = 220 # As requested\n",
|
||||
"N_FILES_PER_BATCH = 10 # Process files in batches to manage memory\n",
|
||||
"\n",
|
||||
"print(f\"Setting up multiprocessing with {N_PROCESSES} processes\")\n",
|
||||
"print(f\"CPU cores available: {multiprocessing.cpu_count()}\")\n",
|
||||
"\n",
|
||||
"# Prepare file batches\n",
|
||||
"all_files = [Path(f) for f in sdf_df['file_path'].tolist()]\n",
|
||||
"file_batches = [all_files[i:i + N_FILES_PER_BATCH] for i in range(0, len(all_files), N_FILES_PER_BATCH)]\n",
|
||||
"\n",
|
||||
"print(f\"Processing {len(all_files)} files in {len(file_batches)} batches\")\n",
|
||||
"print(f\"Average files per batch: {len(all_files) / len(file_batches):.1f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def process_file_batch(file_batch, patterns_dict, batch_id):\n",
|
||||
" \"\"\"Process a batch of SDF files.\"\"\"\n",
|
||||
" batch_results = []\n",
|
||||
" \n",
|
||||
" for sdf_file in file_batch:\n",
|
||||
" try:\n",
|
||||
" results = process_sdf_file(sdf_file, patterns_dict)\n",
|
||||
" batch_results.extend(results)\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error in batch {batch_id} processing {sdf_file}: {e}\")\n",
|
||||
" \n",
|
||||
" return batch_results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Run parallel processing\n",
|
||||
"print(\"Starting parallel substructure matching...\")\n",
|
||||
"start_time = time.time()\n",
|
||||
"\n",
|
||||
"all_results = []\n",
|
||||
"processed_batches = 0\n",
|
||||
"\n",
|
||||
"# Process batches with progress bar\n",
|
||||
"for batch_results in tqdm(\n",
|
||||
" Parallel(n_jobs=N_PROCESSES, backend='loky', verbose=1)(\n",
|
||||
" delayed(process_file_batch)(batch, compiled_patterns, i) \n",
|
||||
" for i, batch in enumerate(file_batches)\n",
|
||||
" ),\n",
|
||||
" total=len(file_batches),\n",
|
||||
" desc=\"Processing batches\"\n",
|
||||
"):\n",
|
||||
" all_results.extend(batch_results)\n",
|
||||
" processed_batches += 1\n",
|
||||
" \n",
|
||||
" # Save intermediate results every 10 batches\n",
|
||||
" if processed_batches % 10 == 0:\n",
|
||||
" intermediate_df = pd.DataFrame(all_results)\n",
|
||||
" intermediate_file = results_dir / f'intermediate_results_batch_{processed_batches}.csv'\n",
|
||||
" intermediate_df.to_csv(intermediate_file, index=False)\n",
|
||||
" print(f\"Saved intermediate results: {len(all_results)} molecules processed\")\n",
|
||||
"\n",
|
||||
"total_time = time.time() - start_time\n",
|
||||
"print(f\"\\nProcessing completed in {total_time:.2f} seconds\")\n",
|
||||
"print(f\"Total molecules processed: {len(all_results)}\")\n",
|
||||
"print(f\"Average processing speed: {len(all_results) / total_time:.1f} molecules/second\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Convert results to DataFrame and save\n",
|
||||
"if all_results:\n",
|
||||
" # Flatten the results\n",
|
||||
" flattened_results = []\n",
|
||||
" for result in all_results:\n",
|
||||
" row = {\n",
|
||||
" 'smiles': result['smiles'],\n",
|
||||
" 'sdf_file': result['sdf_file']\n",
|
||||
" }\n",
|
||||
" row.update(result['matches'])\n",
|
||||
" flattened_results.append(row)\n",
|
||||
" \n",
|
||||
" results_df = pd.DataFrame(flattened_results)\n",
|
||||
" \n",
|
||||
" # Save complete results\n",
|
||||
" results_file = results_dir / 'complete_matching_results.csv'\n",
|
||||
" results_df.to_csv(results_file, index=False)\n",
|
||||
" print(f\"Saved complete results to: {results_file}\")\n",
|
||||
" \n",
|
||||
" # Generate summary statistics\n",
|
||||
" summary_stats = {}\n",
|
||||
" for pattern_name in compiled_patterns.keys():\n",
|
||||
" count = results_df[pattern_name].sum()\n",
|
||||
" percentage = (count / len(results_df)) * 100\n",
|
||||
" summary_stats[pattern_name] = {\n",
|
||||
" 'count': int(count),\n",
|
||||
" 'percentage': percentage\n",
|
||||
" }\n",
|
||||
" \n",
|
||||
" summary_df = pd.DataFrame(summary_stats).T\n",
|
||||
" summary_file = results_dir / 'matching_summary.csv'\n",
|
||||
" summary_df.to_csv(summary_file)\n",
|
||||
" \n",
|
||||
" print(f\"\\nMatching Summary:\")\n",
|
||||
" print(summary_df)\n",
|
||||
" \n",
|
||||
" # Save molecules with specific patterns\n",
|
||||
" for pattern_name in compiled_patterns.keys():\n",
|
||||
" matching_mols = results_df[results_df[pattern_name] == True]\n",
|
||||
" if len(matching_mols) > 0:\n",
|
||||
" pattern_file = results_dir / f'molecules_with_{pattern_name}.csv'\n",
|
||||
" matching_mols.to_csv(pattern_file, index=False)\n",
|
||||
" print(f\"Saved {len(matching_mols)} molecules with {pattern_name} pattern\")\n",
|
||||
"else:\n",
|
||||
" print(\"No results to save\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Performance analysis\n",
|
||||
"print(\"\\nPerformance Analysis:\")\n",
|
||||
"print(f\"Total processing time: {total_time:.2f} seconds\")\n",
|
||||
"print(f\"Total molecules processed: {len(all_results)}\")\n",
|
||||
"print(f\"Number of processes used: {N_PROCESSES}\")\n",
|
||||
"print(f\"Average molecules per second: {len(all_results) / total_time:.1f}\")\n",
|
||||
"print(f\"Average molecules per process: {len(all_results) / N_PROCESSES:.1f}\")\n",
|
||||
"\n",
|
||||
"# File processing statistics\n",
|
||||
"files_processed = len(set([r['sdf_file'] for r in all_results]))\n",
|
||||
"print(f\"Files successfully processed: {files_processed}/{len(all_files)}\")\n",
|
||||
"print(f\"Average molecules per file: {len(all_results) / files_processed:.1f}\")\n",
|
||||
"\n",
|
||||
"# Memory usage estimation\n",
|
||||
"if all_results:\n",
|
||||
" import sys\n",
|
||||
" result_size = sys.getsizeof(all_results) / (1024**2)\n",
|
||||
" print(f\"Estimated memory usage for results: {result_size:.2f} MB\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a quick visualization of pattern frequencies\n",
|
||||
"try:\n",
|
||||
" import matplotlib.pyplot as plt\n",
|
||||
" import seaborn as sns\n",
|
||||
" \n",
|
||||
" plt.figure(figsize=(12, 8))\n",
|
||||
" pattern_counts = [results_df[pattern].sum() for pattern in compiled_patterns.keys()]\n",
|
||||
" pattern_names = list(compiled_patterns.keys())\n",
|
||||
" \n",
|
||||
" # Create bar plot\n",
|
||||
" plt.bar(range(len(pattern_names)), pattern_counts)\n",
|
||||
" plt.xticks(range(len(pattern_names)), pattern_names, rotation=45, ha='right')\n",
|
||||
" plt.ylabel('Number of Molecules')\n",
|
||||
" plt.title('Substructure Pattern Frequencies')\n",
|
||||
" plt.tight_layout()\n",
|
||||
" \n",
|
||||
" # Save plot\n",
|
||||
" plot_file = results_dir / 'pattern_frequencies.png'\n",
|
||||
" plt.savefig(plot_file, dpi=300, bbox_inches='tight')\n",
|
||||
" plt.show()\n",
|
||||
" \n",
|
||||
" print(f\"Saved visualization to: {plot_file}\")\n",
|
||||
" \n",
|
||||
"except ImportError:\n",
|
||||
" print(\"Matplotlib not available for visualization\")\n",
|
||||
" print(\"Pattern counts:\")\n",
|
||||
" for pattern_name in compiled_patterns.keys():\n",
|
||||
" count = results_df[pattern_name].sum()\n",
|
||||
" print(f\" {pattern_name}: {int(count)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary\n",
|
||||
"\n",
|
||||
"This notebook successfully:\n",
|
||||
"1. ✅ Set up RDKit and compiled SMARTS patterns\n",
|
||||
"2. ✅ Implemented parallel processing with 220 processes\n",
|
||||
"3. ✅ Processed all SDF files for substructure matching\n",
|
||||
"4. ✅ Generated comprehensive results and statistics\n",
|
||||
"5. ✅ Saved results in multiple formats for further analysis\n",
|
||||
"\n",
|
||||
"The results are saved in the `matching_results` directory with:\n",
|
||||
"- Complete matching results CSV\n",
|
||||
"- Summary statistics\n",
|
||||
"- Individual pattern matches\n",
|
||||
"- Performance analysis\n",
|
||||
"- Visualization (if matplotlib available)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "default",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
Reference in New Issue
Block a user