416 lines
15 KiB
Plaintext
416 lines
15 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# CPU Detection and Joblib Parallel SMARTS Matching\n",
|
|
"\n",
|
|
"This notebook:\n",
|
|
"1. Detects available CPU cores and uses 80% of them\n",
|
|
"2. Uses joblib for parallel RDKit SMARTS matching\n",
|
|
"3. Tests with a single SDF file from extracted_sdf_files directory"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import multiprocessing\n",
|
|
"import psutil\n",
|
|
"import time\n",
|
|
"import warnings\n",
|
|
"from pathlib import Path\n",
|
|
"from rdkit import Chem\n",
|
|
"from rdkit.Chem import SDMolSupplier\n",
|
|
"from joblib import Parallel, delayed\n",
|
|
"from tqdm import tqdm\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"warnings.filterwarnings('ignore')\n",
|
|
"\n",
|
|
"print(\"=== CPU Detection and System Information ===\")\n",
|
|
"\n",
|
|
"# Get basic CPU information\n",
|
|
"total_cores = multiprocessing.cpu_count()\n",
|
|
"print(f\"Total CPU cores available: {total_cores}\")\n",
|
|
"\n",
|
|
"# Get physical vs logical cores using psutil\n",
|
|
"physical_cores = psutil.cpu_count(logical=False)\n",
|
|
"logical_cores = psutil.cpu_count(logical=True)\n",
|
|
"print(f\"Physical cores: {physical_cores}\")\n",
|
|
"print(f\"Logical cores: {logical_cores}\")\n",
|
|
"\n",
|
|
"# Calculate 80% of available cores\n",
|
|
"target_cores = int(total_cores * 0.8)\n",
|
|
"print(f\"\\nUsing 80% of CPU cores: {target_cores} cores\")\n",
|
|
"print(f\"Utilization: {(target_cores/total_cores)*100:.1f}%\")\n",
|
|
"\n",
|
|
"# Get current CPU usage\n",
|
|
"cpu_percent = psutil.cpu_percent(interval=1)\n",
|
|
"print(f\"Current CPU usage: {cpu_percent:.1f}%\")\n",
|
|
"\n",
|
|
"# Get memory information\n",
|
|
"memory = psutil.virtual_memory()\n",
|
|
"print(f\"\\nMemory Information:\")\n",
|
|
"print(f\"Total memory: {memory.total / (1024**3):.2f} GB\")\n",
|
|
"print(f\"Available memory: {memory.available / (1024**3):.2f} GB\")\n",
|
|
"print(f\"Memory usage: {memory.percent:.1f}%\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Set up paths\n",
|
|
"project_root = Path('..').resolve()\n",
|
|
"extracted_sdf_dir = project_root / 'extracted_sdf_files'\n",
|
|
"\n",
|
|
"print(f\"Project root: {project_root}\")\n",
|
|
"print(f\"Extracted SDF directory: {extracted_sdf_dir}\")\n",
|
|
"\n",
|
|
"# Find all SDF files in extracted_sdf_files\n",
|
|
"sdf_files = list(extracted_sdf_dir.rglob('*.sdf'))\n",
|
|
"print(f\"\\nFound {len(sdf_files)} SDF files in extracted_sdf_files directory\")\n",
|
|
"\n",
|
|
"# Display first few files\n",
|
|
"if sdf_files:\n",
|
|
" print(\"\\nFirst 5 SDF files:\")\n",
|
|
" for i, sdf_file in enumerate(sdf_files[:5]):\n",
|
|
" file_size = sdf_file.stat().st_size / (1024**2) # Size in MB\n",
|
|
" print(f\" {i+1}. {sdf_file.relative_to(project_root)} ({file_size:.2f} MB)\")\n",
|
|
" \n",
|
|
" # Select the first file for testing\n",
|
|
" test_sdf_file = sdf_files[0]\n",
|
|
" print(f\"\\nSelected test file: {test_sdf_file.relative_to(project_root)}\")\n",
|
|
"else:\n",
|
|
" print(\"No SDF files found in extracted_sdf_files directory!\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Define SMARTS patterns for substructure matching\n",
|
|
"smarts_patterns = {\n",
|
|
" 'benzene_ring': 'c1ccccc1',\n",
|
|
" 'pyridine': 'c1ccncc1', \n",
|
|
" 'carboxylic_acid': 'C(=O)O',\n",
|
|
" 'alcohol': '[OX2H]',\n",
|
|
" 'amine': '[NX3;H2,H1;!$(NC=O)]',\n",
|
|
" 'amide': 'C(=O)N',\n",
|
|
" 'ester': 'C(=O)OC',\n",
|
|
" 'ketone': 'C(=O)C',\n",
|
|
" 'aldehyde': 'C(=O)H',\n",
|
|
" 'nitro': '[N+](=O)[O-]',\n",
|
|
" 'halogen': '[Cl,Br,F,I]',\n",
|
|
" 'sulfonamide': 'S(=O)(=O)N',\n",
|
|
" 'heterocycle': '[n,o,s]',\n",
|
|
" 'aromatic_ring': '[a]',\n",
|
|
" 'alkene': '[C]=[C]',\n",
|
|
" 'alkyne': '[C]#[C]',\n",
|
|
" 'ether': '[OD2]([C])[C]',\n",
|
|
" 'phenol': 'c1ccc(cc1)[OX2H]'\n",
|
|
"}\n",
|
|
"\n",
|
|
"# Compile SMARTS patterns\n",
|
|
"compiled_patterns = {}\n",
|
|
"print(\"Compiling SMARTS patterns...\")\n",
|
|
"for name, smarts in smarts_patterns.items():\n",
|
|
" try:\n",
|
|
" pattern = Chem.MolFromSmarts(smarts)\n",
|
|
" if pattern is not None:\n",
|
|
" compiled_patterns[name] = pattern\n",
|
|
" print(f\"✓ {name}: {smarts}\")\n",
|
|
" else:\n",
|
|
" print(f\"✗ Failed to compile {name}: {smarts}\")\n",
|
|
" except Exception as e:\n",
|
|
" print(f\"✗ Error compiling {name}: {e}\")\n",
|
|
"\n",
|
|
"print(f\"\\nSuccessfully compiled {len(compiled_patterns)} SMARTS patterns\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def load_molecules_from_sdf(sdf_path, max_molecules=None):\n",
|
|
" \"\"\"Load molecules from SDF file.\"\"\"\n",
|
|
" print(f\"Loading molecules from {sdf_path.name}...\")\n",
|
|
" \n",
|
|
" try:\n",
|
|
" suppl = SDMolSupplier(str(sdf_path), sanitize=True)\n",
|
|
" molecules = []\n",
|
|
" \n",
|
|
" for i, mol in enumerate(suppl):\n",
|
|
" if mol is not None:\n",
|
|
" molecules.append(mol)\n",
|
|
" \n",
|
|
" if max_molecules and len(molecules) >= max_molecules:\n",
|
|
" break\n",
|
|
" \n",
|
|
" print(f\"Successfully loaded {len(molecules)} valid molecules\")\n",
|
|
" return molecules\n",
|
|
" \n",
|
|
" except Exception as e:\n",
|
|
" print(f\"Error loading molecules: {e}\")\n",
|
|
" return []\n",
|
|
"\n",
|
|
"def match_single_molecule(mol, patterns_dict, mol_id):\n",
|
|
" \"\"\"Match a single molecule against all SMARTS patterns.\"\"\"\n",
|
|
" if mol is None:\n",
|
|
" return None\n",
|
|
" \n",
|
|
" matches = {}\n",
|
|
" mol_smiles = Chem.MolToSmiles(mol)\n",
|
|
" \n",
|
|
" for pattern_name, pattern in patterns_dict.items():\n",
|
|
" try:\n",
|
|
" if mol.HasSubstructMatch(pattern):\n",
|
|
" matches[pattern_name] = True\n",
|
|
" else:\n",
|
|
" matches[pattern_name] = False\n",
|
|
" except Exception as e:\n",
|
|
" matches[pattern_name] = False\n",
|
|
" \n",
|
|
" return {\n",
|
|
" 'mol_id': mol_id,\n",
|
|
" 'smiles': mol_smiles,\n",
|
|
" 'matches': matches\n",
|
|
" }\n",
|
|
"\n",
|
|
"print(\"Helper functions defined successfully\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load test molecules\n",
|
|
"if 'test_sdf_file' in locals():\n",
|
|
" # Load a subset for testing (adjust as needed)\n",
|
|
" max_test_molecules = 1000 # Adjust this number based on your needs\n",
|
|
" molecules = load_molecules_from_sdf(test_sdf_file, max_molecules=max_test_molecules)\n",
|
|
" \n",
|
|
" if molecules:\n",
|
|
" print(f\"\\nLoaded {len(molecules)} molecules for testing\")\n",
|
|
" print(f\"Sample molecule SMILES: {Chem.MolToSmiles(molecules[0])}\")\n",
|
|
" else:\n",
|
|
" print(\"No molecules loaded!\")\n",
|
|
"else:\n",
|
|
" print(\"No test SDF file selected!\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Test sequential processing first\n",
|
|
"if 'molecules' in locals() and molecules:\n",
|
|
" print(\"=== Sequential Processing Test ===\")\n",
|
|
" \n",
|
|
" # Test with first 10 molecules\n",
|
|
" test_mols = molecules[:10]\n",
|
|
" \n",
|
|
" start_time = time.time()\n",
|
|
" sequential_results = []\n",
|
|
" \n",
|
|
" for i, mol in enumerate(test_mols):\n",
|
|
" result = match_single_molecule(mol, compiled_patterns, i)\n",
|
|
" if result:\n",
|
|
" sequential_results.append(result)\n",
|
|
" \n",
|
|
" sequential_time = time.time() - start_time\n",
|
|
" \n",
|
|
" print(f\"Sequential processing completed in {sequential_time:.3f} seconds\")\n",
|
|
" print(f\"Processed {len(sequential_results)} molecules\")\n",
|
|
" print(f\"Average time per molecule: {sequential_time/len(sequential_results):.3f} seconds\")\n",
|
|
" \n",
|
|
" # Display sample result\n",
|
|
" if sequential_results:\n",
|
|
" sample = sequential_results[0]\n",
|
|
" print(f\"\\nSample result:\")\n",
|
|
" print(f\" Molecule ID: {sample['mol_id']}\")\n",
|
|
" print(f\" SMILES: {sample['smiles']}\")\n",
|
|
" print(f\" Matches: {sample['matches']}\")\n",
|
|
"else:\n",
|
|
" print(\"No molecules available for testing!\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Parallel processing with joblib using 80% CPU\n",
|
|
"if 'molecules' in locals() and molecules:\n",
|
|
" print(f\"=== Parallel Processing with {target_cores} cores (80% CPU) ===\")\n",
|
|
" \n",
|
|
" # Use all loaded molecules or a subset for testing\n",
|
|
" parallel_test_mols = molecules[:100] # Adjust this number as needed\n",
|
|
" print(f\"Processing {len(parallel_test_mols)} molecules in parallel...\")\n",
|
|
" \n",
|
|
" start_time = time.time()\n",
|
|
" \n",
|
|
" # Run parallel processing\n",
|
|
" parallel_results = Parallel(\n",
|
|
" n_jobs=target_cores,\n",
|
|
" backend='loky',\n",
|
|
" verbose=1\n",
|
|
" )(\n",
|
|
" delayed(match_single_molecule)(mol, compiled_patterns, i) \n",
|
|
" for i, mol in enumerate(parallel_test_mols)\n",
|
|
" )\n",
|
|
" \n",
|
|
" # Filter out None results\n",
|
|
" parallel_results = [r for r in parallel_results if r is not None]\n",
|
|
" \n",
|
|
" parallel_time = time.time() - start_time\n",
|
|
" \n",
|
|
" print(f\"\\nParallel processing completed in {parallel_time:.3f} seconds\")\n",
|
|
" print(f\"Successfully processed {len(parallel_results)} molecules\")\n",
|
|
" print(f\"Average time per molecule: {parallel_time/len(parallel_results):.3f} seconds\")\n",
|
|
" print(f\"Processing speed: {len(parallel_results)/parallel_time:.1f} molecules/second\")\n",
|
|
" \n",
|
|
" # Calculate speedup\n",
|
|
" if 'sequential_time' in locals():\n",
|
|
" speedup = sequential_time / parallel_time\n",
|
|
" efficiency = (speedup / target_cores) * 100\n",
|
|
" print(f\"\\nPerformance Analysis:\")\n",
|
|
" print(f\"Speedup: {speedup:.2f}x\")\n",
|
|
" print(f\"Parallel efficiency: {efficiency:.1f}%\")\n",
|
|
" \n",
|
|
"else:\n",
|
|
" print(\"No molecules available for parallel processing!\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Convert results to DataFrame and analyze\n",
|
|
"if 'parallel_results' in locals() and parallel_results:\n",
|
|
" print(\"=== Results Analysis ===\")\n",
|
|
" \n",
|
|
" # Flatten results for DataFrame\n",
|
|
" flattened_results = []\n",
|
|
" for result in parallel_results:\n",
|
|
" row = {\n",
|
|
" 'mol_id': result['mol_id'],\n",
|
|
" 'smiles': result['smiles']\n",
|
|
" }\n",
|
|
" row.update(result['matches'])\n",
|
|
" flattened_results.append(row)\n",
|
|
" \n",
|
|
" results_df = pd.DataFrame(flattened_results)\n",
|
|
" \n",
|
|
" print(f\"Results DataFrame shape: {results_df.shape}\")\n",
|
|
" print(f\"\\nColumns: {list(results_df.columns)}\")\n",
|
|
" \n",
|
|
" # Pattern matching statistics\n",
|
|
" print(f\"\\nPattern Matching Statistics:\")\n",
|
|
" for pattern_name in compiled_patterns.keys():\n",
|
|
" count = results_df[pattern_name].sum()\n",
|
|
" percentage = (count / len(results_df)) * 100\n",
|
|
" print(f\" {pattern_name}: {count} molecules ({percentage:.1f}%)\")\n",
|
|
" \n",
|
|
" # Display some sample results\n",
|
|
" print(f\"\\nSample results (first 5 molecules):\")\n",
|
|
" print(results_df[['mol_id', 'smiles'] + list(compiled_patterns.keys())].head())\n",
|
|
" \n",
|
|
" # Save results\n",
|
|
" results_dir = Path('../results')\n",
|
|
" results_dir.mkdir(exist_ok=True)\n",
|
|
" \n",
|
|
" results_file = results_dir / f'parallel_matching_results_{len(parallel_results)}mols.csv'\n",
|
|
" results_df.to_csv(results_file, index=False)\n",
|
|
" print(f\"\\nResults saved to: {results_file}\")\n",
|
|
" \n",
|
|
"else:\n",
|
|
" print(\"No results to analyze!\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Monitor CPU usage during processing\n",
|
|
"print(\"=== CPU Usage Monitoring ===\")\n",
|
|
"print(f\"Current CPU usage: {psutil.cpu_percent(interval=1):.1f}%\")\n",
|
|
"print(f\"CPU usage per core: {psutil.cpu_percent(percpu=True)}\")\n",
|
|
"\n",
|
|
"# Memory usage\n",
|
|
"memory = psutil.virtual_memory()\n",
|
|
"print(f\"\\nMemory usage: {memory.percent:.1f}%\")\n",
|
|
"print(f\"Available memory: {memory.available / (1024**3):.2f} GB\")\n",
|
|
"\n",
|
|
"# Process information\n",
|
|
"current_process = psutil.Process()\n",
|
|
"print(f\"\\nCurrent process memory usage: {current_process.memory_info().rss / (1024**2):.2f} MB\")\n",
|
|
"print(f\"Current process CPU usage: {current_process.cpu_percent():.1f}%\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Summary\n",
|
|
"\n",
|
|
"This notebook successfully:\n",
|
|
"1. ✅ Detected available CPU cores and calculated 80% usage\n",
|
|
"2. ✅ Used joblib for parallel RDKit SMARTS matching\n",
|
|
"3. ✅ Tested with a real SDF file from extracted_sdf_files directory\n",
|
|
"4. ✅ Compared sequential vs parallel performance\n",
|
|
"5. ✅ Reported system resource usage after processing\n",
|
|
"\n",
|
|
"### Key Results:\n",
|
|
"- **Total CPU cores**: see `total_cores` in the CPU detection cell output\n",
|
|
"- **Used cores (80%)**: see `target_cores` in the CPU detection cell output\n",
|
|
"- **Parallel efficiency**: Calculated based on speedup\n",
|
|
"- **Processing speed**: Molecules per second\n",
|
|
"\n",
|
|
"The results are saved in the `results` directory for further analysis."
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "default",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.14.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|