431 lines
15 KiB
Plaintext
431 lines
15 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# RDKit Parallel Processing Analysis\n",
|
||
"\n",
|
||
"分析RDKit匹配场景的并行性和SDF读取性能瓶颈"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import time\n",
|
||
"import multiprocessing\n",
|
||
"import psutil\n",
|
||
"from pathlib import Path\n",
|
||
"from rdkit import Chem\n",
|
||
"from rdkit.Chem import SDMolSupplier\n",
|
||
"from joblib import Parallel, delayed\n",
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"from tqdm import tqdm\n",
|
||
"import matplotlib.pyplot as plt\n",
|
||
"\n",
|
||
"print(\"=== RDKit并行处理分析 ===\")\n",
|
||
"print(f\"可用CPU核心数: {multiprocessing.cpu_count()}\")\n",
|
||
"print(f\"使用80%核心: {int(multiprocessing.cpu_count() * 0.8)}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 1. SDF读取性能测试"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def test_sdf_loading_performance(sdf_path, molecule_counts=[100, 500, 1000, 5000, 10000]):\n",
|
||
" \"\"\"测试不同分子数量下的SDF加载性能\"\"\"\n",
|
||
" \n",
|
||
" loading_times = {}\n",
|
||
" memory_usage = {}\n",
|
||
" \n",
|
||
" print(f\"测试文件: {sdf_path.name}\")\n",
|
||
" print(\"分子数量\\t加载时间(s)\\t内存使用(MB)\\t速度(mol/s)\")\n",
|
||
" print(\"-\" * 60)\n",
|
||
" \n",
|
||
" for count in molecule_counts:\n",
|
||
" # 记录开始时间和内存\n",
|
||
" start_time = time.time()\n",
|
||
" process = psutil.Process()\n",
|
||
" start_memory = process.memory_info().rss / (1024**2)\n",
|
||
" \n",
|
||
" # 加载分子\n",
|
||
" suppl = SDMolSupplier(str(sdf_path), sanitize=True)\n",
|
||
" molecules = []\n",
|
||
" \n",
|
||
" for i, mol in enumerate(suppl):\n",
|
||
" if mol is not None:\n",
|
||
" molecules.append(mol)\n",
|
||
" if len(molecules) >= count:\n",
|
||
" break\n",
|
||
" \n",
|
||
" # 记录结束时间和内存\n",
|
||
" end_time = time.time()\n",
|
||
" end_memory = process.memory_info().rss / (1024**2)\n",
|
||
" \n",
|
||
" loading_time = end_time - start_time\n",
|
||
" memory_used = end_memory - start_memory\n",
|
||
" speed = len(molecules) / loading_time if loading_time > 0 else 0\n",
|
||
" \n",
|
||
" loading_times[count] = loading_time\n",
|
||
" memory_usage[count] = memory_used\n",
|
||
" \n",
|
||
" print(f\"{count}\\t\\t{loading_time:.3f}\\t\\t{memory_used:.1f}\\t\\t{speed:.1f}\")\n",
|
||
" \n",
|
||
" return loading_times, memory_usage\n",
|
||
"\n",
|
||
"# 找一个测试文件\n",
|
||
"extracted_sdf_dir = Path('../extracted_sdf_files')\n",
|
||
"sdf_files = list(extracted_sdf_dir.rglob('*.sdf'))\n",
|
||
"\n",
|
||
"if sdf_files:\n",
|
||
" test_file = sdf_files[0]\n",
|
||
" loading_times, memory_usage = test_sdf_loading_performance(test_file)\n",
|
||
"else:\n",
|
||
" print(\"未找到SDF文件\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 2. RDKit匹配性能测试"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def test_matching_performance(molecules, patterns_dict, molecule_counts=[100, 500, 1000, 5000]):\n",
|
||
" \"\"\"测试不同分子数量下的匹配性能\"\"\"\n",
|
||
" \n",
|
||
" def match_single_molecule(mol, patterns):\n",
|
||
" matches = {}\n",
|
||
" for pattern_name, pattern in patterns.items():\n",
|
||
" try:\n",
|
||
" matches[pattern_name] = mol.HasSubstructMatch(pattern)\n",
|
||
" except:\n",
|
||
" matches[pattern_name] = False\n",
|
||
" return matches\n",
|
||
" \n",
|
||
" matching_times = {}\n",
|
||
" \n",
|
||
" print(\"分子数量\\t匹配时间(s)\\t速度(mol/s)\\t平均时间/分子(ms)\")\n",
|
||
" print(\"-\" * 60)\n",
|
||
" \n",
|
||
" for count in molecule_counts:\n",
|
||
" if count > len(molecules):\n",
|
||
" continue\n",
|
||
" \n",
|
||
" test_mols = molecules[:count]\n",
|
||
" \n",
|
||
" start_time = time.time()\n",
|
||
" \n",
|
||
" for mol in test_mols:\n",
|
||
" match_single_molecule(mol, patterns_dict)\n",
|
||
" \n",
|
||
" end_time = time.time()\n",
|
||
" matching_time = end_time - start_time\n",
|
||
" speed = count / matching_time if matching_time > 0 else 0\n",
|
||
" avg_time_per_mol = (matching_time / count) * 1000 # 转换为毫秒\n",
|
||
" \n",
|
||
" matching_times[count] = matching_time\n",
|
||
" \n",
|
||
" print(f\"{count}\\t\\t{matching_time:.3f}\\t\\t{speed:.1f}\\t\\t{avg_time_per_mol:.2f}\")\n",
|
||
" \n",
|
||
" return matching_times\n",
|
||
"\n",
|
||
"# 测试匹配性能\n",
|
||
"if 'test_file' in locals():\n",
|
||
" # 加载一些分子用于测试\n",
|
||
" suppl = SDMolSupplier(str(test_file), sanitize=True)\n",
|
||
" test_molecules = [mol for mol in suppl if mol is not None][:10000]\n",
|
||
" \n",
|
||
" # 定义测试SMARTS模式\n",
|
||
" test_patterns = {\n",
|
||
" 'benzene': Chem.MolFromSmarts('c1ccccc1'),\n",
|
||
" 'alcohol': Chem.MolFromSmarts('[OX2H]'),\n",
|
||
" 'carboxylic_acid': Chem.MolFromSmarts('C(=O)O'),\n",
|
||
" 'amine': Chem.MolFromSmarts('[NX3;H2,H1;!$(NC=O)]')\n",
|
||
" }\n",
|
||
" \n",
|
||
" print(f\"\\n=== 匹配性能测试 (使用{len(test_patterns)}个SMARTS模式) ===\")\n",
|
||
" matching_times = test_matching_performance(test_molecules, test_patterns)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 3. 并行vs串行性能对比"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def compare_parallel_vs_serial(molecules, patterns_dict, n_jobs_list=[1, 2, 4, 8, 16]):\n",
|
||
" \"\"\"比较不同并行度下的性能\"\"\"\n",
|
||
" \n",
|
||
" def match_single_molecule(mol, patterns):\n",
|
||
" matches = {}\n",
|
||
" for pattern_name, pattern in patterns.items():\n",
|
||
" try:\n",
|
||
" matches[pattern_name] = mol.HasSubstructMatch(pattern)\n",
|
||
" except:\n",
|
||
" matches[pattern_name] = False\n",
|
||
" return matches\n",
|
||
" \n",
|
||
" test_mols = molecules[:1000] # 使用1000个分子测试\n",
|
||
" results = {}\n",
|
||
" \n",
|
||
" print(f\"测试分子数量: {len(test_mols)}\")\n",
|
||
" print(f\"SMARTS模式数量: {len(patterns_dict)}\")\n",
|
||
" print(\"\\n并行度\\t时间(s)\\t速度(mol/s)\\t加速比\\t效率(%)\")\n",
|
||
" print(\"-\" * 65)\n",
|
||
" \n",
|
||
" serial_time = None\n",
|
||
" \n",
|
||
" for n_jobs in n_jobs_list:\n",
|
||
" start_time = time.time()\n",
|
||
" \n",
|
||
" if n_jobs == 1:\n",
|
||
" # 串行处理\n",
|
||
" for mol in test_mols:\n",
|
||
" match_single_molecule(mol, patterns_dict)\n",
|
||
" else:\n",
|
||
" # 并行处理\n",
|
||
" Parallel(n_jobs=n_jobs, backend='loky')(\n",
|
||
" delayed(match_single_molecule)(mol, patterns_dict) \n",
|
||
" for mol in test_mols\n",
|
||
" )\n",
|
||
" \n",
|
||
" end_time = time.time()\n",
|
||
" processing_time = end_time - start_time\n",
|
||
" speed = len(test_mols) / processing_time\n",
|
||
" \n",
|
||
" if serial_time is None:\n",
|
||
" serial_time = processing_time\n",
|
||
" speedup = 1.0\n",
|
||
" efficiency = 100.0\n",
|
||
" else:\n",
|
||
" speedup = serial_time / processing_time\n",
|
||
" efficiency = (speedup / n_jobs) * 100\n",
|
||
" \n",
|
||
" results[n_jobs] = {\n",
|
||
" 'time': processing_time,\n",
|
||
" 'speed': speed,\n",
|
||
" 'speedup': speedup,\n",
|
||
" 'efficiency': efficiency\n",
|
||
" }\n",
|
||
" \n",
|
||
" print(f\"{n_jobs}\\t\\t{processing_time:.3f}\\t\\t{speed:.1f}\\t\\t{speedup:.2f}x\\t{efficiency:.1f}\")\n",
|
||
" \n",
|
||
" return results\n",
|
||
"\n",
|
||
"# 运行并行vs串行对比\n",
|
||
"if 'test_molecules' in locals() and 'test_patterns' in locals():\n",
|
||
" print(\"\\n=== 并行vs串行性能对比 ===\")\n",
|
||
" max_jobs = min(16, int(multiprocessing.cpu_count() * 0.8))\n",
|
||
" n_jobs_list = [1, 2, 4, 8, max_jobs] if max_jobs > 8 else [1, 2, 4, max_jobs]\n",
|
||
" \n",
|
||
" parallel_results = compare_parallel_vs_serial(test_molecules, test_patterns, n_jobs_list)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 4. I/O瓶颈分析"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def analyze_io_bottleneck():\n",
|
||
" \"\"\"分析I/O瓶颈\"\"\"\n",
|
||
" \n",
|
||
" print(\"=== I/O瓶颈分析 ===\")\n",
|
||
" \n",
|
||
" # 测试文件读取速度\n",
|
||
" if 'test_file' in locals():\n",
|
||
" file_size = test_file.stat().st_size / (1024**2) # MB\n",
|
||
" \n",
|
||
" print(f\"\\n文件信息:\")\n",
|
||
" print(f\"文件大小: {file_size:.2f} MB\")\n",
|
||
" print(f\"文件路径: {test_file}\")\n",
|
||
" \n",
|
||
" # 测试纯文件读取速度\n",
|
||
" start_time = time.time()\n",
|
||
" with open(test_file, 'rb') as f:\n",
|
||
" raw_data = f.read()\n",
|
||
" raw_read_time = time.time() - start_time\n",
|
||
" raw_read_speed = file_size / raw_read_time\n",
|
||
" \n",
|
||
" print(f\"\\n纯文件读取:\")\n",
|
||
" print(f\"读取时间: {raw_read_time:.3f} s\")\n",
|
||
" print(f\"读取速度: {raw_read_speed:.2f} MB/s\")\n",
|
||
" \n",
|
||
" # 测试RDKit解析速度\n",
|
||
" start_time = time.time()\n",
|
||
" suppl = SDMolSupplier(str(test_file), sanitize=True)\n",
|
||
" molecules = [mol for mol in suppl if mol is not None]\n",
|
||
" parse_time = time.time() - start_time\n",
|
||
" parse_speed = len(molecules) / parse_time\n",
|
||
" \n",
|
||
" print(f\"\\nRDKit解析:\")\n",
|
||
" print(f\"解析时间: {parse_time:.3f} s\")\n",
|
||
" print(f\"分子数量: {len(molecules)}\")\n",
|
||
" print(f\"解析速度: {parse_speed:.1f} mol/s\")\n",
|
||
" \n",
|
||
" # 计算I/O占比\n",
|
||
" io_percentage = (raw_read_time / parse_time) * 100\n",
|
||
" print(f\"\\nI/O时间占比: {io_percentage:.1f}%\")\n",
|
||
" \n",
|
||
" if io_percentage > 50:\n",
|
||
" print(\"⚠️ I/O是主要瓶颈,建议:\")\n",
|
||
" print(\" - 使用更快的存储(SSD)\")\n",
|
||
" print(\" - 预处理SDF文件为其他格式\")\n",
|
||
" print(\" - 使用内存映射\")\n",
|
||
" else:\n",
|
||
" print(\"✅ 计算是主要瓶颈,并行化有效\")\n",
|
||
"\n",
|
||
"analyze_io_bottleneck()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 5. 优化建议"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def provide_optimization_recommendations():\n",
|
||
" \"\"\"提供优化建议\"\"\"\n",
|
||
" \n",
|
||
" print(\"=== RDKit并行处理优化建议 ===\")\n",
|
||
" \n",
|
||
" recommendations = [\n",
|
||
" {\n",
|
||
" \"问题\": \"SDF文件读取慢\",\n",
|
||
" \"原因\": \"RDKit需要解析分子结构,sanitization耗时\",\n",
|
||
" \"解决方案\": [\n",
|
||
" \"1. 预处理SDF为pickle/feather格式\",\n",
|
||
" \"2. 禁用不必要的sanitization\",\n",
|
||
" \"3. 使用更快的存储介质\",\n",
|
||
" \"4. 分批读取避免内存溢出\"\n",
|
||
" ]\n",
|
||
" },\n",
|
||
" {\n",
|
||
" \"问题\": \"并行效率低\",\n",
|
||
" \"原因\": \"RDKit的GIL限制和进程间通信开销\",\n",
|
||
" \"解决方案\": [\n",
|
||
" \"1. 使用多进程而非多线程\",\n",
|
||
" \"2. 增大每个任务的粒度\",\n",
|
||
" \"3. 预编译SMARTS模式\",\n",
|
||
" \"4. 使用loky backend减少开销\"\n",
|
||
" ]\n",
|
||
" },\n",
|
||
" {\n",
|
||
" \"问题\": \"内存使用过高\",\n",
|
||
" \"原因\": \"分子对象在内存中占用空间大\",\n",
|
||
" \"解决方案\": [\n",
|
||
" \"1. 分批处理\",\n",
|
||
" \"2. 及时释放不需要的分子\",\n",
|
||
" \"3. 使用生成器而非列表\",\n",
|
||
" \"4. 考虑使用SMILES字符串替代分子对象\"\n",
|
||
" ]\n",
|
||
" }\n",
|
||
" ]\n",
|
||
" \n",
|
||
" for i, rec in enumerate(recommendations, 1):\n",
|
||
" print(f\"\\n{i}. {rec['问题']}\")\n",
|
||
" print(f\" 原因: {rec['原因']}\")\n",
|
||
" print(f\" 解决方案:\")\n",
|
||
" for solution in rec['解决方案']:\n",
|
||
" print(f\" {solution}\")\n",
|
||
" \n",
|
||
" print(f\"\\n=== 最佳实践总结 ===\")\n",
|
||
" print(\"1. 🚀 对于大量小分子: 并行处理效果好\")\n",
|
||
" print(\"2. 📁 对于大文件: 预处理和分批读取更重要\")\n",
|
||
" print(\"3. 💾 内存受限: 使用流式处理和生成器\")\n",
|
||
" print(\"4. ⚡ CPU密集: 适当增加并行度,但避免过度并行\")\n",
|
||
" print(\"5. 🔄 I/O密集: 优化存储和文件格式\")\n",
|
||
"\n",
|
||
"provide_optimization_recommendations()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## 结论\n",
|
||
"\n",
|
||
"### RDKit + Joblib 并行性分析:\n",
|
||
"\n",
|
||
"1. **✅ 可以并行**: RDKit的分子匹配是CPU密集型任务,适合并行化\n",
|
||
"2. **⚠️ 有效但有局限**: 由于GIL和进程开销,并行效率通常在60-80%\n",
|
||
"3. **📊 最佳并行度**: 通常为CPU核心数的50-80%\n",
|
||
"\n",
|
||
"### SDF读取性能:\n",
|
||
"\n",
|
||
"1. **🐌 相对较慢**: SDF解析需要分子sanitization,比纯文本读取慢10-100倍\n",
|
||
"2. **💾 内存密集**: 每个分子对象在内存中占用较大空间\n",
|
||
"3. **🔄 I/O瓶颈**: 对于大文件,I/O可能成为主要瓶颈\n",
|
||
"\n",
|
||
"### 推荐策略:\n",
|
||
"\n",
|
||
"- **小文件+多模式**: 并行处理效果好\n",
|
||
"- **大文件+少模式**: 优化I/O更重要\n",
|
||
"- **混合策略**: 预处理 + 并行匹配 + 分批处理"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "search_macro",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.0"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
}
|