Files
search_macro/notebooks/03_cpu_detection_joblib_matching.ipynb
2025-11-14 18:46:03 +08:00

416 lines
15 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CPU Detection and Joblib Parallel SMARTS Matching\n",
"\n",
"This notebook:\n",
"1. Detects available CPU cores and uses 80% of them\n",
"2. Uses joblib for parallel RDKit SMARTS matching\n",
"3. Tests with a single SDF file from extracted_sdf_files directory"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import multiprocessing\n",
"import psutil\n",
"import time\n",
"import warnings\n",
"from pathlib import Path\n",
"from rdkit import Chem\n",
"from rdkit.Chem import SDMolSupplier\n",
"from joblib import Parallel, delayed\n",
"from tqdm import tqdm\n",
"import pandas as pd\n",
"\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"print(\"=== CPU Detection and System Information ===\")\n",
"\n",
"# Get basic CPU information\n",
"total_cores = multiprocessing.cpu_count()\n",
"print(f\"Total CPU cores available: {total_cores}\")\n",
"\n",
"# Get physical vs logical cores using psutil\n",
"physical_cores = psutil.cpu_count(logical=False)\n",
"logical_cores = psutil.cpu_count(logical=True)\n",
"print(f\"Physical cores: {physical_cores}\")\n",
"print(f\"Logical cores: {logical_cores}\")\n",
"\n",
"# Calculate 80% of available cores\n",
"target_cores = max(1, int(total_cores * 0.8))  # clamp to >=1 so Parallel(n_jobs=...) never gets 0 on small machines\n",
"print(f\"\\nUsing 80% of CPU cores: {target_cores} cores\")\n",
"print(f\"Utilization: {(target_cores/total_cores)*100:.1f}%\")\n",
"\n",
"# Get current CPU usage\n",
"cpu_percent = psutil.cpu_percent(interval=1)\n",
"print(f\"Current CPU usage: {cpu_percent:.1f}%\")\n",
"\n",
"# Get memory information\n",
"memory = psutil.virtual_memory()\n",
"print(f\"\\nMemory Information:\")\n",
"print(f\"Total memory: {memory.total / (1024**3):.2f} GB\")\n",
"print(f\"Available memory: {memory.available / (1024**3):.2f} GB\")\n",
"print(f\"Memory usage: {memory.percent:.1f}%\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set up paths\n",
"project_root = Path('..').resolve()\n",
"extracted_sdf_dir = project_root / 'extracted_sdf_files'\n",
"\n",
"print(f\"Project root: {project_root}\")\n",
"print(f\"Extracted SDF directory: {extracted_sdf_dir}\")\n",
"\n",
"# Find all SDF files in extracted_sdf_files\n",
"sdf_files = list(extracted_sdf_dir.rglob('*.sdf'))\n",
"print(f\"\\nFound {len(sdf_files)} SDF files in extracted_sdf_files directory\")\n",
"\n",
"# Display first few files\n",
"if sdf_files:\n",
" print(\"\\nFirst 5 SDF files:\")\n",
" for i, sdf_file in enumerate(sdf_files[:5]):\n",
" file_size = sdf_file.stat().st_size / (1024**2) # Size in MB\n",
" print(f\" {i+1}. {sdf_file.relative_to(project_root)} ({file_size:.2f} MB)\")\n",
" \n",
" # Select the first file for testing\n",
" test_sdf_file = sdf_files[0]\n",
" print(f\"\\nSelected test file: {test_sdf_file.relative_to(project_root)}\")\n",
"else:\n",
" print(\"No SDF files found in extracted_sdf_files directory!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Define SMARTS patterns for substructure matching\n",
"smarts_patterns = {\n",
" 'benzene_ring': 'c1ccccc1',\n",
" 'pyridine': 'c1ccncc1', \n",
" 'carboxylic_acid': 'C(=O)O',\n",
" 'alcohol': '[OX2H]',\n",
" 'amine': '[NX3;H2,H1;!$(NC=O)]',\n",
" 'amide': 'C(=O)N',\n",
" 'ester': 'C(=O)OC',\n",
" 'ketone': 'C(=O)C',\n",
" 'aldehyde': '[CX3H1]=O',  # bare 'H' only matches explicit hydrogens; use implicit-H count instead\n",
" 'nitro': '[N+](=O)[O-]',\n",
" 'halogen': '[Cl,Br,F,I]',\n",
" 'sulfonamide': 'S(=O)(=O)N',\n",
" 'heterocycle': '[n,o,s]',\n",
" 'aromatic_ring': '[a]',\n",
" 'alkene': '[C]=[C]',\n",
" 'alkyne': '[C]#[C]',\n",
" 'ether': '[OD2]([C])[C]',\n",
" 'phenol': 'c1ccc(cc1)[OX2H]'\n",
"}\n",
"\n",
"# Compile SMARTS patterns\n",
"compiled_patterns = {}\n",
"print(\"Compiling SMARTS patterns...\")\n",
"for name, smarts in smarts_patterns.items():\n",
" try:\n",
" pattern = Chem.MolFromSmarts(smarts)\n",
" if pattern is not None:\n",
" compiled_patterns[name] = pattern\n",
" print(f\"✓ {name}: {smarts}\")\n",
" else:\n",
" print(f\"✗ Failed to compile {name}: {smarts}\")\n",
" except Exception as e:\n",
" print(f\"✗ Error compiling {name}: {e}\")\n",
"\n",
"print(f\"\\nSuccessfully compiled {len(compiled_patterns)} SMARTS patterns\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def load_molecules_from_sdf(sdf_path, max_molecules=None):\n",
" \"\"\"Load molecules from SDF file.\"\"\"\n",
" print(f\"Loading molecules from {sdf_path.name}...\")\n",
" \n",
" try:\n",
" suppl = SDMolSupplier(str(sdf_path), sanitize=True)\n",
" molecules = []\n",
" \n",
" for i, mol in enumerate(suppl):\n",
" if mol is not None:\n",
" molecules.append(mol)\n",
" \n",
" if max_molecules and len(molecules) >= max_molecules:\n",
" break\n",
" \n",
" print(f\"Successfully loaded {len(molecules)} valid molecules\")\n",
" return molecules\n",
" \n",
" except Exception as e:\n",
" print(f\"Error loading molecules: {e}\")\n",
" return []\n",
"\n",
"def match_single_molecule(mol, patterns_dict, mol_id):\n",
" \"\"\"Match a single molecule against all SMARTS patterns.\"\"\"\n",
" if mol is None:\n",
" return None\n",
" \n",
" matches = {}\n",
" mol_smiles = Chem.MolToSmiles(mol)\n",
" \n",
" for pattern_name, pattern in patterns_dict.items():\n",
" try:\n",
" if mol.HasSubstructMatch(pattern):\n",
" matches[pattern_name] = True\n",
" else:\n",
" matches[pattern_name] = False\n",
" except Exception as e:\n",
" matches[pattern_name] = False\n",
" \n",
" return {\n",
" 'mol_id': mol_id,\n",
" 'smiles': mol_smiles,\n",
" 'matches': matches\n",
" }\n",
"\n",
"print(\"Helper functions defined successfully\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load test molecules\n",
"if 'test_sdf_file' in locals():\n",
" # Load a subset for testing (adjust as needed)\n",
" max_test_molecules = 1000 # Adjust this number based on your needs\n",
" molecules = load_molecules_from_sdf(test_sdf_file, max_molecules=max_test_molecules)\n",
" \n",
" if molecules:\n",
" print(f\"\\nLoaded {len(molecules)} molecules for testing\")\n",
" print(f\"Sample molecule SMILES: {Chem.MolToSmiles(molecules[0])}\")\n",
" else:\n",
" print(\"No molecules loaded!\")\n",
"else:\n",
" print(\"No test SDF file selected!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test sequential processing first\n",
"if 'molecules' in locals() and molecules:\n",
" print(\"=== Sequential Processing Test ===\")\n",
" \n",
" # Test with first 10 molecules\n",
" test_mols = molecules[:10]\n",
" \n",
" start_time = time.time()\n",
" sequential_results = []\n",
" \n",
" for i, mol in enumerate(test_mols):\n",
" result = match_single_molecule(mol, compiled_patterns, i)\n",
" if result:\n",
" sequential_results.append(result)\n",
" \n",
" sequential_time = time.time() - start_time\n",
" \n",
" print(f\"Sequential processing completed in {sequential_time:.3f} seconds\")\n",
" print(f\"Processed {len(sequential_results)} molecules\")\n",
" print(f\"Average time per molecule: {sequential_time/len(sequential_results):.3f} seconds\")\n",
" \n",
" # Display sample result\n",
" if sequential_results:\n",
" sample = sequential_results[0]\n",
" print(f\"\\nSample result:\")\n",
" print(f\" Molecule ID: {sample['mol_id']}\")\n",
" print(f\" SMILES: {sample['smiles']}\")\n",
" print(f\" Matches: {sample['matches']}\")\n",
"else:\n",
" print(\"No molecules available for testing!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parallel processing with joblib using 80% CPU\n",
"if 'molecules' in locals() and molecules:\n",
" print(f\"=== Parallel Processing with {target_cores} cores (80% CPU) ===\")\n",
" \n",
" # Use all loaded molecules or a subset for testing\n",
" parallel_test_mols = molecules[:100] # Adjust this number as needed\n",
" print(f\"Processing {len(parallel_test_mols)} molecules in parallel...\")\n",
" \n",
" start_time = time.time()\n",
" \n",
" # Run parallel processing\n",
" parallel_results = Parallel(\n",
" n_jobs=target_cores,\n",
" backend='loky',\n",
" verbose=1\n",
" )(\n",
" delayed(match_single_molecule)(mol, compiled_patterns, i) \n",
" for i, mol in enumerate(parallel_test_mols)\n",
" )\n",
" \n",
" # Filter out None results\n",
" parallel_results = [r for r in parallel_results if r is not None]\n",
" \n",
" parallel_time = time.time() - start_time\n",
" \n",
" print(f\"\\nParallel processing completed in {parallel_time:.3f} seconds\")\n",
" print(f\"Successfully processed {len(parallel_results)} molecules\")\n",
" print(f\"Average time per molecule: {parallel_time/len(parallel_results):.3f} seconds\")\n",
" print(f\"Processing speed: {len(parallel_results)/parallel_time:.1f} molecules/second\")\n",
" \n",
" # Calculate speedup\n",
" if 'sequential_time' in locals():\n",
" speedup = sequential_time / parallel_time\n",
" efficiency = (speedup / target_cores) * 100\n",
" print(f\"\\nPerformance Analysis:\")\n",
" print(f\"Speedup: {speedup:.2f}x\")\n",
" print(f\"Parallel efficiency: {efficiency:.1f}%\")\n",
" \n",
"else:\n",
" print(\"No molecules available for parallel processing!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert results to DataFrame and analyze\n",
"if 'parallel_results' in locals() and parallel_results:\n",
" print(\"=== Results Analysis ===\")\n",
" \n",
" # Flatten results for DataFrame\n",
" flattened_results = []\n",
" for result in parallel_results:\n",
" row = {\n",
" 'mol_id': result['mol_id'],\n",
" 'smiles': result['smiles']\n",
" }\n",
" row.update(result['matches'])\n",
" flattened_results.append(row)\n",
" \n",
" results_df = pd.DataFrame(flattened_results)\n",
" \n",
" print(f\"Results DataFrame shape: {results_df.shape}\")\n",
" print(f\"\\nColumns: {list(results_df.columns)}\")\n",
" \n",
" # Pattern matching statistics\n",
" print(f\"\\nPattern Matching Statistics:\")\n",
" for pattern_name in compiled_patterns.keys():\n",
" count = results_df[pattern_name].sum()\n",
" percentage = (count / len(results_df)) * 100\n",
" print(f\" {pattern_name}: {count} molecules ({percentage:.1f}%)\")\n",
" \n",
" # Display some sample results\n",
" print(f\"\\nSample results (first 5 molecules):\")\n",
" print(results_df[['mol_id', 'smiles'] + list(compiled_patterns.keys())].head())\n",
" \n",
" # Save results\n",
" results_dir = Path('../results')\n",
" results_dir.mkdir(parents=True, exist_ok=True)\n",
" \n",
" results_file = results_dir / f'parallel_matching_results_{len(parallel_results)}mols.csv'\n",
" results_df.to_csv(results_file, index=False)\n",
" print(f\"\\nResults saved to: {results_file}\")\n",
" \n",
"else:\n",
" print(\"No results to analyze!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Monitor CPU usage during processing\n",
"print(\"=== CPU Usage Monitoring ===\")\n",
"print(f\"Current CPU usage: {psutil.cpu_percent(interval=1):.1f}%\")\n",
"print(f\"CPU usage per core: {psutil.cpu_percent(percpu=True)}\")\n",
"\n",
"# Memory usage\n",
"memory = psutil.virtual_memory()\n",
"print(f\"\\nMemory usage: {memory.percent:.1f}%\")\n",
"print(f\"Available memory: {memory.available / (1024**3):.2f} GB\")\n",
"\n",
"# Process information\n",
"current_process = psutil.Process()\n",
"print(f\"\\nCurrent process memory usage: {current_process.memory_info().rss / (1024**2):.2f} MB\")\n",
"print(f\"Current process CPU usage: {current_process.cpu_percent(interval=0.5):.1f}%\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"\n",
"This notebook successfully:\n",
"1. ✅ Detected available CPU cores and calculated 80% usage\n",
"2. ✅ Used joblib for parallel RDKit SMARTS matching\n",
"3. ✅ Tested with a real SDF file from extracted_sdf_files directory\n",
"4. ✅ Compared sequential vs parallel performance\n",
"5. ✅ Monitored system resources during processing\n",
"\n",
"### Key Results:\n",
"- **Total CPU cores**: printed by the CPU detection cell (markdown cells do not interpolate Python variables)\n",
"- **Used cores (80%)**: printed alongside, computed as 80% of the detected total\n",
"- **Parallel efficiency**: Calculated based on speedup\n",
"- **Processing speed**: Molecules per second\n",
"\n",
"The results are saved in the `results` directory for further analysis."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "default",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}