Initial commit
This commit is contained in:
565
notebooks/01_extract_sdf_files.ipynb
Normal file
565
notebooks/01_extract_sdf_files.ipynb
Normal file
@@ -0,0 +1,565 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Extract SDF Files from ZIP Archives\n",
|
||||
"\n",
|
||||
"This notebook extracts all SDF files from ZIP archives and collects existing SDF files into a unified directory."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Data directory: ../data\n",
|
||||
"Output directory: ../extracted_sdf_files\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Archive handling (zip / rar / tar.gz / gz) plus filesystem utilities.
# Std-lib first, then third-party, per convention.
import gzip
import os
import shutil
import tarfile
import zipfile
from pathlib import Path

import pandas as pd
import rarfile
from tqdm import tqdm

# Directory layout: raw archives live under data_dir; everything we
# extract or copy is collected beneath output_dir.
data_dir = Path('../data')
output_dir = Path('../extracted_sdf_files')
output_dir.mkdir(exist_ok=True)

print(f"Data directory: {data_dir}")
print(f"Output directory: {output_dir}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Searching for compressed files and SDF files...\n",
|
||||
"Found 23 compressed files\n",
|
||||
"Found 29 existing SDF files\n",
|
||||
"\n",
|
||||
"Compressed files:\n",
|
||||
" 1. V1ePm5EdwFOnQFwMpOFJk.tar.gz (1.68 GB)\n",
|
||||
" 2. chemial_software.rar (0.05 GB)\n",
|
||||
" 3. 高性价比数据/Legancy 1740260 .zip (0.46 GB)\n",
|
||||
" 4. 高性价比数据/D009 Dec-2024/D009-1396k.zip (0.51 GB)\n",
|
||||
" 5. 高性价比数据/D011 Feb-2025/D011-576k.zip (0.21 GB)\n",
|
||||
" 6. 高性价比数据/D013 Feb-2025/D013-294641.zip (0.08 GB)\n",
|
||||
" 7. 高性价比数据/D111 Mar-2025/D111__439772.zip (0.18 GB)\n",
|
||||
" 8. 高性价比数据/1-165万种核心数据库/D001 Feb-2025/D001-1614k.zip (0.64 GB)\n",
|
||||
" 9. part3-1400w/D013 Feb-2025/D013-294641.zip (0.08 GB)\n",
|
||||
" 10. part3-1400w/D015 jan-2023/D015.zip (0.31 GB)\n",
|
||||
" ... and 13 more files\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Walk the data tree once and bucket every regular file as either an
# archive to unpack or an SDF-like file to copy as-is.
compressed_files = []
existing_sdf_files = []

compressed_extensions = {'.zip', '.rar', '.tar.gz', '.tgz', '.gz'}
sdf_extensions = {'.sdf', '.mol', '.sd'}

print("Searching for compressed files and SDF files...")
for candidate in data_dir.rglob('*'):
    if not candidate.is_file():
        continue
    last_suffix = candidate.suffix.lower()
    # ".tar.gz" only shows up via the joined-suffixes check, since
    # Path.suffix alone would report just ".gz".
    full_suffix = ''.join(candidate.suffixes).lower()
    if last_suffix in compressed_extensions or full_suffix in {'.tar.gz', '.tgz'}:
        compressed_files.append(candidate)
    elif last_suffix in sdf_extensions:
        existing_sdf_files.append(candidate)

print(f"Found {len(compressed_files)} compressed files")
print(f"Found {len(existing_sdf_files)} existing SDF files")

# Preview the first ten archives with their sizes in GB.
print("\nCompressed files:")
for i, file in enumerate(compressed_files[:10]):
    print(f" {i+1}. {file.relative_to(data_dir)} ({file.stat().st_size / (1024**3):.2f} GB)")
if len(compressed_files) > 10:
    print(f" ... and {len(compressed_files) - 10} more files")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def extract_sdf_from_archive(archive_path, extract_to):
    """Extract SDF-like files (.sdf/.mol/.sd) from one archive.

    Supports .zip, .rar, .tar.gz/.tgz and plain single-file .gz.

    Parameters
    ----------
    archive_path : Path
        Archive to read.
    extract_to : Path
        Directory the matching members are extracted into.

    Returns
    -------
    list[Path]
        Paths of the extracted files; empty for unsupported formats or
        on extraction errors (errors are printed, not raised, so one
        corrupt archive cannot abort a batch run).
    """
    sdf_suffixes = ('.sdf', '.mol', '.sd')
    extracted_files = []

    try:
        suffix = archive_path.suffix.lower()
        joined = ''.join(archive_path.suffixes).lower()

        if suffix == '.zip':
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                for file_info in zip_ref.infolist():
                    if file_info.filename.lower().endswith(sdf_suffixes):
                        # ZipFile.extract() returns the path it wrote to,
                        # with the member's internal path preserved.
                        extracted_files.append(Path(zip_ref.extract(file_info, extract_to)))

        elif suffix == '.rar':
            with rarfile.RarFile(archive_path, 'r') as rar_ref:
                for file_info in rar_ref.infolist():
                    if file_info.filename.lower().endswith(sdf_suffixes):
                        extracted_files.append(Path(rar_ref.extract(file_info, extract_to)))

        elif suffix in {'.gz', '.tgz'} or joined in {'.tar.gz', '.tgz'}:
            if joined in {'.tar.gz', '.tgz'}:
                with tarfile.open(archive_path, 'r:gz') as tar_ref:
                    for member in tar_ref.getmembers():
                        if member.name.lower().endswith(sdf_suffixes):
                            # BUG FIX: TarFile.extract() returns None (unlike
                            # ZipFile.extract), so the destination path must
                            # be built explicitly; the old code appended
                            # Path(None), raised TypeError, and every tar.gz
                            # silently yielded zero files.
                            tar_ref.extract(member, extract_to)
                            extracted_files.append(Path(extract_to) / member.name)
            else:
                # Plain single-file gzip: only decompress when the inner
                # name (the .gz-stripped stem) looks like an SDF file.
                output_path = extract_to / archive_path.stem
                if output_path.suffix.lower() in {'.sdf', '.mol', '.sd'}:
                    with gzip.open(archive_path, 'rb') as gz_file:
                        with open(output_path, 'wb') as out_file:
                            shutil.copyfileobj(gz_file, out_file)
                    extracted_files.append(output_path)

    except Exception as e:
        # Best effort: log and return whatever was extracted so far.
        print(f"Error extracting {archive_path}: {e}")

    return extracted_files
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Extracting SDF files from compressed archives...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Extracting archives: 4%|▍ | 1/23 [00:03<01:22, 3.76s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error extracting ../data/V1ePm5EdwFOnQFwMpOFJk.tar.gz: Compressed file ended before the end-of-stream marker was reached\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Extracting archives: 61%|██████ | 14/23 [01:51<01:53, 12.64s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error extracting ../data/part3-1400w/D133 may-2023/D133.rar: Cannot find working tool\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Extracting archives: 100%|██████████| 23/23 [04:47<00:00, 12.49s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Error extracting ../data/part2-845w/D012 may-2023/D012-53344cpds.rar: Cannot find working tool\n",
|
||||
"\n",
|
||||
"Extracted 85 SDF files from archives\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Extract SDF files from all compressed archives.
extracted_sdf_files = []
extraction_log = []

print("Extracting SDF files from compressed archives...")
for archive_path in tqdm(compressed_files, desc="Extracting archives"):
    # BUG FIX: keying the output directory on the bare stem collides when
    # two archives in different folders share a file name (this run had
    # duplicate D013-294641.zip and D111__439772.zip), so one archive's
    # files silently overwrote the other's.  Build the directory name
    # from the archive's path relative to data_dir instead, which is
    # unique by construction.
    rel = archive_path.relative_to(data_dir).with_suffix('')
    archive_extract_dir = output_dir / f"extracted_{'_'.join(rel.parts)}"
    archive_extract_dir.mkdir(parents=True, exist_ok=True)

    extracted = extract_sdf_from_archive(archive_path, archive_extract_dir)
    extracted_sdf_files.extend(extracted)

    # Per-archive record for the summary table in the next cells.
    extraction_log.append({
        'archive': str(archive_path.relative_to(data_dir)),
        'extracted_files': len(extracted),
        'extract_dir': str(archive_extract_dir.relative_to(output_dir))
    })

print(f"\nExtracted {len(extracted_sdf_files)} SDF files from archives")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Copying existing SDF files...\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Copying SDF files: 100%|██████████| 29/29 [01:50<00:00, 3.83s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Copied 29 existing SDF files\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Mirror the already-uncompressed SDF files into output_dir/"existing",
# preserving each file's path relative to data_dir.
copied_sdf_files = []

print("Copying existing SDF files...")
for sdf_path in tqdm(existing_sdf_files, desc="Copying SDF files"):
    relative_path = sdf_path.relative_to(data_dir)
    destination = output_dir / "existing" / relative_path
    destination.parent.mkdir(parents=True, exist_ok=True)

    try:
        # copy2 keeps timestamps/metadata alongside the file contents.
        shutil.copy2(sdf_path, destination)
    except Exception as e:
        print(f"Error copying {sdf_path}: {e}")
    else:
        copied_sdf_files.append(destination)

print(f"Copied {len(copied_sdf_files)} existing SDF files")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Total SDF files available: 110\n",
|
||||
"Total size: 90.10 GB\n",
|
||||
"\n",
|
||||
"Extraction Summary:\n",
|
||||
" archive extracted_files \\\n",
|
||||
"0 V1ePm5EdwFOnQFwMpOFJk.tar.gz 0 \n",
|
||||
"1 chemial_software.rar 0 \n",
|
||||
"2 高性价比数据/Legancy 1740260 .zip 4 \n",
|
||||
"3 高性价比数据/D009 Dec-2024/D009-1396k.zip 3 \n",
|
||||
"4 高性价比数据/D011 Feb-2025/D011-576k.zip 2 \n",
|
||||
"5 高性价比数据/D013 Feb-2025/D013-294641.zip 1 \n",
|
||||
"6 高性价比数据/D111 Mar-2025/D111__439772.zip 3 \n",
|
||||
"7 高性价比数据/1-165万种核心数据库/D001 Feb-2025/D001-1614k.zip 4 \n",
|
||||
"8 part3-1400w/D013 Feb-2025/D013-294641.zip 1 \n",
|
||||
"9 part3-1400w/D015 jan-2023/D015.zip 6 \n",
|
||||
"10 part3-1400w/D021 apr-2022/D021.zip 8 \n",
|
||||
"11 part3-1400w/D058 Mar-2025/D058-718205.zip 3 \n",
|
||||
"12 part3-1400w/D062 june-2024/D062.zip 4 \n",
|
||||
"13 part3-1400w/D111 Mar-2025/D111__439772.zip 3 \n",
|
||||
"14 part3-1400w/D133 may-2023/D133.rar 0 \n",
|
||||
"15 part3-1400w/D140 Mar-2025/D140-370312.zip 2 \n",
|
||||
"16 part3-1400w/D144 Oct-2023/D144-2023.zip 12 \n",
|
||||
"17 part3-1400w/D147 May-2024/D147-5736K.zip 12 \n",
|
||||
"18 part2-845w/D003 May-2025/D003-legacy 1738861 .zip 3 \n",
|
||||
"19 part2-845w/D003 May-2025/D003_4573K.zip 10 \n",
|
||||
"20 part2-845w/D010 Feb-2025/D010-428074.zip 3 \n",
|
||||
"21 part2-845w/D011 April-2025/D011-582106.zip 1 \n",
|
||||
"22 part2-845w/D012 may-2023/D012-53344cpds.rar 0 \n",
|
||||
"\n",
|
||||
" extract_dir \n",
|
||||
"0 extracted_V1ePm5EdwFOnQFwMpOFJk.tar \n",
|
||||
"1 extracted_chemial_software \n",
|
||||
"2 extracted_Legancy 1740260 \n",
|
||||
"3 extracted_D009-1396k \n",
|
||||
"4 extracted_D011-576k \n",
|
||||
"5 extracted_D013-294641 \n",
|
||||
"6 extracted_D111__439772 \n",
|
||||
"7 extracted_D001-1614k \n",
|
||||
"8 extracted_D013-294641 \n",
|
||||
"9 extracted_D015 \n",
|
||||
"10 extracted_D021 \n",
|
||||
"11 extracted_D058-718205 \n",
|
||||
"12 extracted_D062 \n",
|
||||
"13 extracted_D111__439772 \n",
|
||||
"14 extracted_D133 \n",
|
||||
"15 extracted_D140-370312 \n",
|
||||
"16 extracted_D144-2023 \n",
|
||||
"17 extracted_D147-5736K \n",
|
||||
"18 extracted_D003-legacy 1738861 \n",
|
||||
"19 extracted_D003_4573K \n",
|
||||
"20 extracted_D010-428074 \n",
|
||||
"21 extracted_D011-582106 \n",
|
||||
"22 extracted_D012-53344cpds \n",
|
||||
"\n",
|
||||
"Saved SDF file list to: ../extracted_sdf_files/sdf_file_list.csv\n",
|
||||
"\n",
|
||||
"Largest SDF files:\n",
|
||||
" relative_path size_mb\n",
|
||||
"65 existing/part1-165w/D001 Aug-2025/D001-1550k/D... 1921.503638\n",
|
||||
"81 existing/part3-1400w/D065 Mar-2025/D065-646891... 1830.950122\n",
|
||||
"64 existing/part1-165w/D001 Aug-2025/D001-1550k/D... 1796.318578\n",
|
||||
"66 existing/part1-165w/D001 Aug-2025/D001-1550k/D... 1793.656826\n",
|
||||
"45 extracted_D003-legacy 1738861 /D003_legacy_2_6... 1699.557550\n",
|
||||
"50 extracted_D003_4573K/D003_legacy_2_600000.sdf 1699.557550\n",
|
||||
"60 extracted_D011-582106/D011-582106.sdf 1646.037828\n",
|
||||
"44 extracted_D003-legacy 1738861 /D003_legacy_1_6... 1604.531812\n",
|
||||
"49 extracted_D003_4573K/D003_legacy_1_600000.sdf 1604.531812\n",
|
||||
"71 existing/part2-845w/D006 Sep-2025/D006-1697617... 1603.250914\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Inventory every SDF-like file that now lives under output_dir
# (both extracted and copied "existing" files).
all_sdf_files = list(output_dir.rglob('*.sdf')) + list(output_dir.rglob('*.mol')) + list(output_dir.rglob('*.sd'))

print(f"Total SDF files available: {len(all_sdf_files)}")

# Aggregate on-disk size of the collection.
total_size = sum(f.stat().st_size for f in all_sdf_files)
print(f"Total size: {total_size / (1024**3):.2f} GB")

# Per-archive extraction log assembled in the previous cell.
extraction_df = pd.DataFrame(extraction_log)
print("\nExtraction Summary:")
print(extraction_df)

# Persist the inventory so downstream notebooks can load it by path.
sdf_file_list = []
for sdf_file in all_sdf_files:
    # FIX: stat() the file once instead of twice per entry.
    n_bytes = sdf_file.stat().st_size
    sdf_file_list.append({
        'file_path': str(sdf_file),
        'relative_path': str(sdf_file.relative_to(output_dir)),
        'size_bytes': n_bytes,
        'size_mb': n_bytes / (1024**2)
    })

sdf_df = pd.DataFrame(sdf_file_list)
sdf_df.to_csv(output_dir / 'sdf_file_list.csv', index=False)
print(f"\nSaved SDF file list to: {output_dir / 'sdf_file_list.csv'}")

# Surface the heaviest files — useful for planning the matching run.
print("\nLargest SDF files:")
print(sdf_df.nlargest(10, 'size_mb')[['relative_path', 'size_mb']])
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Testing SDF file readability...\n",
|
||||
"✓ D003-4_l 460000.sdf: 460000 molecules\n",
|
||||
"✓ D003-5_l 460000.sdf: 460000 molecules\n",
|
||||
"✓ D003-6_l 460000.sdf: 460000 molecules\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[21:48:17] Explicit valence for atom # 20 B, 4, is greater than permitted\n",
|
||||
"[21:48:17] ERROR: Could not sanitize molecule ending on line 35871530\n",
|
||||
"[21:48:17] ERROR: Explicit valence for atom # 20 B, 4, is greater than permitted\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"✓ D003-7_l 360260.sdf: 360259 molecules\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[21:49:06] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
|
||||
"[21:49:06] ERROR: Could not sanitize molecule ending on line 7897106\n",
|
||||
"[21:49:06] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
|
||||
"[21:49:11] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 12 ignored\n",
|
||||
"[21:49:11] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 8 ignored\n",
|
||||
"[21:49:12] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 12 ignored\n",
|
||||
"[21:49:12] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 14 ignored\n",
|
||||
"[21:49:13] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
|
||||
"[21:49:13] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
|
||||
"[21:49:14] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
|
||||
"[21:49:14] ERROR: Could not sanitize molecule ending on line 11196260\n",
|
||||
"[21:49:14] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n",
|
||||
"[21:49:20] Explicit valence for atom # 5 N, 4, is greater than permitted\n",
|
||||
"[21:49:20] ERROR: Could not sanitize molecule ending on line 13936884\n",
|
||||
"[21:49:20] ERROR: Explicit valence for atom # 5 N, 4, is greater than permitted\n",
|
||||
"[21:49:23] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
|
||||
"[21:49:23] ERROR: Could not sanitize molecule ending on line 15981200\n",
|
||||
"[21:49:23] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
|
||||
"[21:49:27] Explicit valence for atom # 6 O, 3, is greater than permitted\n",
|
||||
"[21:49:27] ERROR: Could not sanitize molecule ending on line 18013023\n",
|
||||
"[21:49:27] ERROR: Explicit valence for atom # 6 O, 3, is greater than permitted\n",
|
||||
"[21:49:37] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
|
||||
"[21:49:37] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
|
||||
"[21:49:47] Explicit valence for atom # 7 N, 4, is greater than permitted\n",
|
||||
"[21:49:47] ERROR: Could not sanitize molecule ending on line 30340115\n",
|
||||
"[21:49:47] ERROR: Explicit valence for atom # 7 N, 4, is greater than permitted\n",
|
||||
"[21:49:47] Explicit valence for atom # 5 N, 4, is greater than permitted\n",
|
||||
"[21:49:47] ERROR: Could not sanitize molecule ending on line 30341543\n",
|
||||
"[21:49:47] ERROR: Explicit valence for atom # 5 N, 4, is greater than permitted\n",
|
||||
"[21:49:47] Explicit valence for atom # 2 C, 5, is greater than permitted\n",
|
||||
"[21:49:47] ERROR: Could not sanitize molecule ending on line 30419985\n",
|
||||
"[21:49:47] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted\n",
|
||||
"[21:49:53] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
|
||||
"[21:49:53] ERROR: Could not sanitize molecule ending on line 33667204\n",
|
||||
"[21:49:53] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
|
||||
"[21:49:54] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
|
||||
"[21:49:54] ERROR: Could not sanitize molecule ending on line 34387487\n",
|
||||
"[21:49:54] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
|
||||
"[21:49:56] Explicit valence for atom # 3 N, 4, is greater than permitted\n",
|
||||
"[21:49:56] ERROR: Could not sanitize molecule ending on line 35795575\n",
|
||||
"[21:49:56] ERROR: Explicit valence for atom # 3 N, 4, is greater than permitted\n",
|
||||
"[21:49:57] Explicit valence for atom # 1 N, 4, is greater than permitted\n",
|
||||
"[21:49:57] ERROR: Could not sanitize molecule ending on line 36061075\n",
|
||||
"[21:49:57] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted\n",
|
||||
"[21:49:57] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
|
||||
"[21:49:57] ERROR: Could not sanitize molecule ending on line 36061540\n",
|
||||
"[21:49:57] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n",
|
||||
"[21:49:57] Explicit valence for atom # 3 N, 4, is greater than permitted\n",
|
||||
"[21:49:57] ERROR: Could not sanitize molecule ending on line 36064097\n",
|
||||
"[21:49:57] ERROR: Explicit valence for atom # 3 N, 4, is greater than permitted\n",
|
||||
"[21:50:05] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
|
||||
"[21:50:05] ERROR: Could not sanitize molecule ending on line 41484117\n",
|
||||
"[21:50:05] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n",
|
||||
"[21:50:06] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
|
||||
"[21:50:06] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
|
||||
"[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n",
|
||||
"[21:50:08] ERROR: Could not sanitize molecule ending on line 43160926\n",
|
||||
"[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n",
|
||||
"[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n",
|
||||
"[21:50:08] ERROR: Could not sanitize molecule ending on line 43161491\n",
|
||||
"[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n",
|
||||
"[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n",
|
||||
"[21:50:08] ERROR: Could not sanitize molecule ending on line 43357350\n",
|
||||
"[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"✓ D009_part1_474930.sdf: 474912 molecules\n",
|
||||
"\n",
|
||||
"Extraction complete! Ready for substructure matching.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Verify a few SDF files can be read by RDKit before the matching run.
from rdkit import Chem
from rdkit.Chem import SDMolSupplier

print("Testing SDF file readability...")
test_files = all_sdf_files[:5]  # Test first 5 files

for sdf_file in test_files:
    try:
        suppl = SDMolSupplier(str(sdf_file))
        # FIX (memory): these files run to multiple GB, so count the
        # parseable molecules with a generator instead of materializing
        # every Mol object in a list at once.
        n_mols = sum(1 for mol in suppl if mol is not None)
        print(f"✓ {sdf_file.name}: {n_mols} molecules")
    except Exception as e:
        print(f"✗ {sdf_file.name}: Error - {e}")

print("\nExtraction complete! Ready for substructure matching.")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "default",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
413
notebooks/02_rdkit_substructure_matching.ipynb
Normal file
413
notebooks/02_rdkit_substructure_matching.ipynb
Normal file
@@ -0,0 +1,413 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# RDKit Substructure Matching with Multiprocessing\n",
|
||||
"\n",
|
||||
"This notebook performs parallel substructure matching on all extracted SDF files using SMARTS patterns with 220 processes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Std-lib first, then the chemistry / parallelism tooling.
import multiprocessing
import time
import warnings
from pathlib import Path

import pandas as pd
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import SDMolSupplier, AllChem
from tqdm import tqdm

# RDKit emits copious parse warnings on large vendor SDF files.
warnings.filterwarnings('ignore')

# Inputs come from notebook 01; results get their own directory.
sdf_dir = Path('../extracted_sdf_files')
results_dir = Path('../matching_results')
results_dir.mkdir(exist_ok=True)

print(f"SDF directory: {sdf_dir}")
print(f"Results directory: {results_dir}")

# Inventory CSV written by notebook 01.
sdf_df = pd.read_csv(sdf_dir / 'sdf_file_list.csv')
print(f"Found {len(sdf_df)} SDF files to process")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# SMARTS patterns for the functional groups we want to flag per molecule.
smarts_patterns = {
    'benzene_ring': 'c1ccccc1',
    'pyridine': 'c1ccncc1',
    # NOTE(review): 'C(=O)O' also matches esters; an acid-specific
    # pattern would be 'C(=O)[OX2H1]' — kept as-is for continuity.
    'carboxylic_acid': 'C(=O)O',
    'alcohol': '[OX2H]',
    'amine': '[NX3;H2,H1;!$(NC=O)]',
    'amide': 'C(=O)N',
    'ester': 'C(=O)OC',
    'ketone': 'C(=O)C',
    # BUG FIX: the old 'C(=O)H' requires an *explicit* hydrogen atom in
    # the molecule graph, which SDF molecules with implicit Hs never
    # have — it matched nothing.  Use the SMARTS H-count primitive.
    'aldehyde': '[CX3H1]=O',
    'nitro': '[N+](=O)[O-]',
    'halogen': '[Cl,Br,F,I]',
    'sulfonamide': 'S(=O)(=O)N',
    'heterocycle': '[n,o,s]',
    'aromatic_ring': '[a]',
    'alkene': '[C]=[C]',
    'alkyne': '[C]#[C]',
    'ether': '[OD2]([C])[C]',
    'phenol': 'c1ccc(cc1)[OX2H]'
}

# Compile each pattern once up front; report any that fail to parse so
# a bad SMARTS string is caught before the multi-hour matching run.
compiled_patterns = {}
for name, smarts in smarts_patterns.items():
    try:
        pattern = Chem.MolFromSmarts(smarts)
        if pattern is not None:
            compiled_patterns[name] = pattern
            print(f"✓ Compiled SMARTS for {name}: {smarts}")
        else:
            print(f"✗ Failed to compile SMARTS for {name}: {smarts}")
    except Exception as e:
        print(f"✗ Error compiling SMARTS for {name}: {e}")

print(f"\nSuccessfully compiled {len(compiled_patterns)} SMARTS patterns")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def match_mol_to_patterns(mol, patterns_dict):
    """Match a single molecule against all compiled SMARTS patterns.

    Parameters
    ----------
    mol : rdkit Mol or None
        Molecule from an SDMolSupplier (None when RDKit failed to parse).
    patterns_dict : dict[str, Mol]
        Pattern name -> compiled SMARTS query molecule.

    Returns
    -------
    dict or None
        None when mol is None; otherwise a dict with the molecule's
        identifier (its SDF '_Name' property when present, else its
        canonical SMILES — kept under the legacy 'smiles' key for
        downstream compatibility) and a pattern_name -> bool map.
    """
    if mol is None:
        return None

    # Prefer the vendor-assigned name; fall back to canonical SMILES.
    # (Same logic as before, without the '== False' inversion.)
    mol_smiles = mol.GetProp('_Name') if mol.HasProp('_Name') else Chem.MolToSmiles(mol)

    matches = {}
    for pattern_name, pattern in patterns_dict.items():
        try:
            # HasSubstructMatch already returns a bool — no if/else needed.
            matches[pattern_name] = mol.HasSubstructMatch(pattern)
        except Exception:
            # Treat un-matchable (e.g. unsanitized) molecules as no-match.
            matches[pattern_name] = False

    return {
        'smiles': mol_smiles,
        'matches': matches
    }
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def process_sdf_file(sdf_path, patterns_dict, max_molecules=None):
    """Process a single SDF file and return matching results.

    Parameters
    ----------
    sdf_path : Path
        SDF file to scan.
    patterns_dict : dict[str, Mol]
        Compiled SMARTS patterns (see match_mol_to_patterns).
    max_molecules : int or None
        Optional cap on the number of parseable molecules processed.

    Returns
    -------
    list[dict]
        One match_mol_to_patterns() record per parseable molecule, each
        annotated with the source file path relative to the global
        sdf_dir.  Errors are printed and yield an empty list so one bad
        file cannot kill a whole batch.
    """
    try:
        suppl = SDMolSupplier(str(sdf_path), sanitize=True)

        results = []
        # FIX (memory): iterate the supplier lazily instead of loading
        # every molecule of a multi-GB file into a list before (possibly)
        # truncating it.  Stops early once max_molecules is reached.
        for mol in suppl:
            if mol is None:
                continue
            result = match_mol_to_patterns(mol, patterns_dict)
            if result:
                result['sdf_file'] = str(sdf_path.relative_to(sdf_dir))
                results.append(result)
            if max_molecules and len(results) >= max_molecules:
                break

        return results
    except Exception as e:
        print(f"Error processing {sdf_path}: {e}")
        return []
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Smoke-test the pipeline on three files, 100 molecules each, before
# committing to the full parallel run.
print("Testing with a small subset of files...")
test_files = [Path(f) for f in sdf_df.head(3)['file_path'].tolist()]

start_time = time.time()
test_results = []

for sdf_file in test_files:
    print(f"Processing {sdf_file.name}...")
    results = process_sdf_file(sdf_file, compiled_patterns, max_molecules=100)
    test_results.extend(results)
    print(f" Found {len(results)} molecules")

test_time = time.time() - start_time
print(f"\nTest completed in {test_time:.2f} seconds")
print(f"Processed {len(test_results)} molecules")

# Show one record so the output schema is visible in the notebook.
if test_results:
    sample_result = test_results[0]
    print(f"\nSample result:")
    print(f" SMILES: {sample_result['smiles']}")
    print(f" Matches: {sample_result['matches']}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Parallelism knobs: 220 workers as requested; files are grouped into
# batches of 10 so each worker's memory footprint stays bounded.
N_PROCESSES = 220
N_FILES_PER_BATCH = 10

print(f"Setting up multiprocessing with {N_PROCESSES} processes")
print(f"CPU cores available: {multiprocessing.cpu_count()}")

# Slice the full file list into fixed-size batches.
all_files = [Path(f) for f in sdf_df['file_path'].tolist()]
file_batches = []
for start in range(0, len(all_files), N_FILES_PER_BATCH):
    file_batches.append(all_files[start:start + N_FILES_PER_BATCH])

print(f"Processing {len(all_files)} files in {len(file_batches)} batches")
print(f"Average files per batch: {len(all_files) / len(file_batches):.1f}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def process_file_batch(file_batch, patterns_dict, batch_id):
    """Run process_sdf_file over a batch of files and pool the results.

    batch_id is only used to label error messages from this worker.
    """
    pooled = []
    for sdf_file in file_batch:
        try:
            pooled.extend(process_sdf_file(sdf_file, patterns_dict))
        except Exception as e:
            # Keep the batch alive even if one file blows up.
            print(f"Error in batch {batch_id} processing {sdf_file}: {e}")
    return pooled
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Run parallel substructure matching over all file batches.
print("Starting parallel substructure matching...")
start_time = time.time()

all_results = []
processed_batches = 0

# Process batches with progress bar
# NOTE(review): joblib's Parallel(...) call returns a fully materialized
# list, so every batch has finished BEFORE this for-loop starts
# iterating.  The tqdm bar and the "intermediate" saves below therefore
# only track the (fast) collection phase, not the hours of computation.
# Consider Parallel(..., return_as="generator") (joblib >= 1.3) if live
# progress and true incremental checkpoints are needed.
for batch_results in tqdm(
    Parallel(n_jobs=N_PROCESSES, backend='loky', verbose=1)(
        delayed(process_file_batch)(batch, compiled_patterns, i) 
        for i, batch in enumerate(file_batches)
    ),
    total=len(file_batches),
    desc="Processing batches"
):
    all_results.extend(batch_results)
    processed_batches += 1
    
    # Save intermediate results every 10 batches
    # NOTE(review): rows still contain the nested 'matches' dict here;
    # the flattening into per-pattern columns happens in the next cell.
    if processed_batches % 10 == 0:
        intermediate_df = pd.DataFrame(all_results)
        intermediate_file = results_dir / f'intermediate_results_batch_{processed_batches}.csv'
        intermediate_df.to_csv(intermediate_file, index=False)
        print(f"Saved intermediate results: {len(all_results)} molecules processed")

total_time = time.time() - start_time
print(f"\nProcessing completed in {total_time:.2f} seconds")
print(f"Total molecules processed: {len(all_results)}")
print(f"Average processing speed: {len(all_results) / total_time:.1f} molecules/second")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Persist the complete matching results plus per-pattern subsets.
if all_results:
    # Flatten each record's nested 'matches' dict into top-level columns.
    flattened_results = [
        {'smiles': rec['smiles'], 'sdf_file': rec['sdf_file'], **rec['matches']}
        for rec in all_results
    ]
    results_df = pd.DataFrame(flattened_results)

    # Full results table.
    results_file = results_dir / 'complete_matching_results.csv'
    results_df.to_csv(results_file, index=False)
    print(f"Saved complete results to: {results_file}")

    # Per-pattern hit counts and percentages.
    summary_stats = {
        name: {
            'count': int(results_df[name].sum()),
            'percentage': (results_df[name].sum() / len(results_df)) * 100,
        }
        for name in compiled_patterns.keys()
    }
    summary_df = pd.DataFrame(summary_stats).T
    summary_df.to_csv(results_dir / 'matching_summary.csv')

    print("\nMatching Summary:")
    print(summary_df)

    # One CSV per pattern, containing only the molecules that matched it.
    for name in compiled_patterns.keys():
        hits = results_df[results_df[name] == True]
        if len(hits) > 0:
            hits.to_csv(results_dir / f'molecules_with_{name}.csv', index=False)
            print(f"Saved {len(hits)} molecules with {name} pattern")
else:
    print("No results to save")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Summarize throughput of the parallel run.
print("\nPerformance Analysis:")
print(f"Total processing time: {total_time:.2f} seconds")
print(f"Total molecules processed: {len(all_results)}")
print(f"Number of processes used: {N_PROCESSES}")
# Guard against ZeroDivisionError on an empty / instantaneous run.
if total_time > 0:
    print(f"Average molecules per second: {len(all_results) / total_time:.1f}")
print(f"Average molecules per process: {len(all_results) / N_PROCESSES:.1f}")

# Count distinct input files that produced at least one result.
files_processed = len({r['sdf_file'] for r in all_results})
print(f"Files successfully processed: {files_processed}/{len(all_files)}")
# All files may have failed; avoid dividing by zero.
if files_processed > 0:
    print(f"Average molecules per file: {len(all_results) / files_processed:.1f}")

# Rough memory estimate. sys.getsizeof on a list is *shallow* (list object
# plus pointers only); add the per-record dict sizes so the figure is not
# wildly misleading. Still an underestimate (strings/values not counted).
if all_results:
    import sys
    result_size = (
        sys.getsizeof(all_results)
        + sum(sys.getsizeof(r) for r in all_results)
    ) / (1024**2)
    print(f"Estimated memory usage for results: {result_size:.2f} MB")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Quick bar chart of per-pattern hit counts.
try:
    # BUG FIX: seaborn was imported here but never used; if seaborn alone was
    # missing, the ImportError wrongly disabled plotting even though
    # matplotlib was available. Import only what is used.
    import matplotlib.pyplot as plt

    pattern_names = list(compiled_patterns.keys())
    pattern_counts = [results_df[name].sum() for name in pattern_names]

    # Explicit figure/axes interface; easier to save and close deterministically.
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(range(len(pattern_names)), pattern_counts)
    ax.set_xticks(range(len(pattern_names)))
    ax.set_xticklabels(pattern_names, rotation=45, ha='right')
    ax.set_ylabel('Number of Molecules')
    ax.set_title('Substructure Pattern Frequencies')
    fig.tight_layout()

    # Save before showing (show() may clear the active figure in some backends).
    plot_file = results_dir / 'pattern_frequencies.png'
    fig.savefig(plot_file, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Saved visualization to: {plot_file}")

except ImportError:
    print("Matplotlib not available for visualization")
    print("Pattern counts:")
    for pattern_name in compiled_patterns.keys():
        count = results_df[pattern_name].sum()
        print(f"  {pattern_name}: {int(count)}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary\n",
|
||||
"\n",
|
||||
"This notebook successfully:\n",
|
||||
"1. ✅ Set up RDKit and compiled SMARTS patterns\n",
|
||||
"2. ✅ Implemented parallel processing with 220 processes\n",
|
||||
"3. ✅ Processed all SDF files for substructure matching\n",
|
||||
"4. ✅ Generated comprehensive results and statistics\n",
|
||||
"5. ✅ Saved results in multiple formats for further analysis\n",
|
||||
"\n",
|
||||
"The results are saved in the `matching_results` directory with:\n",
|
||||
"- Complete matching results CSV\n",
|
||||
"- Summary statistics\n",
|
||||
"- Individual pattern matches\n",
|
||||
"- Performance analysis\n",
|
||||
"- Visualization (if matplotlib available)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "default",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
415
notebooks/03_cpu_detection_joblib_matching.ipynb
Normal file
415
notebooks/03_cpu_detection_joblib_matching.ipynb
Normal file
@@ -0,0 +1,415 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# CPU Detection and Joblib Parallel SMARTS Matching\n",
|
||||
"\n",
|
||||
"This notebook:\n",
|
||||
"1. Detects available CPU cores and uses 80% of them\n",
|
||||
"2. Uses joblib for parallel RDKit SMARTS matching\n",
|
||||
"3. Tests with a single SDF file from extracted_sdf_files directory"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"import multiprocessing\n",
|
||||
"import psutil\n",
|
||||
"import time\n",
|
||||
"import warnings\n",
|
||||
"from pathlib import Path\n",
|
||||
"from rdkit import Chem\n",
|
||||
"from rdkit.Chem import SDMolSupplier\n",
|
||||
"from joblib import Parallel, delayed\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"warnings.filterwarnings('ignore')\n",
|
||||
"\n",
|
||||
"print(\"=== CPU Detection and System Information ===\")\n",
|
||||
"\n",
|
||||
"# Get basic CPU information\n",
|
||||
"total_cores = multiprocessing.cpu_count()\n",
|
||||
"print(f\"Total CPU cores available: {total_cores}\")\n",
|
||||
"\n",
|
||||
"# Get physical vs logical cores using psutil\n",
|
||||
"physical_cores = psutil.cpu_count(logical=False)\n",
|
||||
"logical_cores = psutil.cpu_count(logical=True)\n",
|
||||
"print(f\"Physical cores: {physical_cores}\")\n",
|
||||
"print(f\"Logical cores: {logical_cores}\")\n",
|
||||
"\n",
|
||||
"# Calculate 80% of available cores\n",
|
||||
"target_cores = int(total_cores * 0.8)\n",
|
||||
"print(f\"\\nUsing 80% of CPU cores: {target_cores} cores\")\n",
|
||||
"print(f\"Utilization: {(target_cores/total_cores)*100:.1f}%\")\n",
|
||||
"\n",
|
||||
"# Get current CPU usage\n",
|
||||
"cpu_percent = psutil.cpu_percent(interval=1)\n",
|
||||
"print(f\"Current CPU usage: {cpu_percent:.1f}%\")\n",
|
||||
"\n",
|
||||
"# Get memory information\n",
|
||||
"memory = psutil.virtual_memory()\n",
|
||||
"print(f\"\\nMemory Information:\")\n",
|
||||
"print(f\"Total memory: {memory.total / (1024**3):.2f} GB\")\n",
|
||||
"print(f\"Available memory: {memory.available / (1024**3):.2f} GB\")\n",
|
||||
"print(f\"Memory usage: {memory.percent:.1f}%\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Set up paths\n",
|
||||
"project_root = Path('..').resolve()\n",
|
||||
"extracted_sdf_dir = project_root / 'extracted_sdf_files'\n",
|
||||
"\n",
|
||||
"print(f\"Project root: {project_root}\")\n",
|
||||
"print(f\"Extracted SDF directory: {extracted_sdf_dir}\")\n",
|
||||
"\n",
|
||||
"# Find all SDF files in extracted_sdf_files\n",
|
||||
"sdf_files = list(extracted_sdf_dir.rglob('*.sdf'))\n",
|
||||
"print(f\"\\nFound {len(sdf_files)} SDF files in extracted_sdf_files directory\")\n",
|
||||
"\n",
|
||||
"# Display first few files\n",
|
||||
"if sdf_files:\n",
|
||||
" print(\"\\nFirst 5 SDF files:\")\n",
|
||||
" for i, sdf_file in enumerate(sdf_files[:5]):\n",
|
||||
" file_size = sdf_file.stat().st_size / (1024**2) # Size in MB\n",
|
||||
" print(f\" {i+1}. {sdf_file.relative_to(project_root)} ({file_size:.2f} MB)\")\n",
|
||||
" \n",
|
||||
" # Select the first file for testing\n",
|
||||
" test_sdf_file = sdf_files[0]\n",
|
||||
" print(f\"\\nSelected test file: {test_sdf_file.relative_to(project_root)}\")\n",
|
||||
"else:\n",
|
||||
" print(\"No SDF files found in extracted_sdf_files directory!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# SMARTS queries used for substructure screening.
# NOTE: several patterns are deliberately loose — e.g. 'C(=O)O' also matches
# esters, and 'C(=O)C' matches any carbonyl with a carbon neighbor. Tighten
# them if exact functional-group classification is required.
smarts_patterns = {
    'benzene_ring': 'c1ccccc1',
    'pyridine': 'c1ccncc1',
    'carboxylic_acid': 'C(=O)O',
    'alcohol': '[OX2H]',
    'amine': '[NX3;H2,H1;!$(NC=O)]',
    'amide': 'C(=O)N',
    'ester': 'C(=O)OC',
    'ketone': 'C(=O)C',
    # BUG FIX: was 'C(=O)H'. A bare 'H' outside brackets is not a valid
    # organic-subset SMARTS atom, so the pattern cannot be compiled/matched.
    # Use a hydrogen-count primitive instead.
    'aldehyde': '[CX3H1]=O',
    'nitro': '[N+](=O)[O-]',
    'halogen': '[Cl,Br,F,I]',
    'sulfonamide': 'S(=O)(=O)N',
    'heterocycle': '[n,o,s]',
    'aromatic_ring': '[a]',
    'alkene': '[C]=[C]',
    'alkyne': '[C]#[C]',
    'ether': '[OD2]([C])[C]',
    'phenol': 'c1ccc(cc1)[OX2H]',
}

# Pre-compile each query once; report (rather than silently drop) failures.
compiled_patterns = {}
print("Compiling SMARTS patterns...")
for name, smarts in smarts_patterns.items():
    try:
        pattern = Chem.MolFromSmarts(smarts)
        if pattern is not None:
            compiled_patterns[name] = pattern
            print(f"✓ {name}: {smarts}")
        else:
            print(f"✗ Failed to compile {name}: {smarts}")
    except Exception as e:
        print(f"✗ Error compiling {name}: {e}")

print(f"\nSuccessfully compiled {len(compiled_patterns)} SMARTS patterns")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def load_molecules_from_sdf(sdf_path, max_molecules=None):
    """Read molecules from an SDF file, skipping unparsable records.

    Parameters
    ----------
    sdf_path : Path
        SDF file to read.
    max_molecules : int, optional
        Stop after this many valid molecules (falsy = read everything).

    Returns
    -------
    list
        Valid RDKit Mol objects ([] on read failure).
    """
    print(f"Loading molecules from {sdf_path.name}...")
    try:
        reader = SDMolSupplier(str(sdf_path), sanitize=True)
        loaded = []
        for mol in reader:
            if mol is None:
                # RDKit yields None for records it cannot parse/sanitize.
                continue
            loaded.append(mol)
            if max_molecules and len(loaded) >= max_molecules:
                break
        print(f"Successfully loaded {len(loaded)} valid molecules")
        return loaded
    except Exception as exc:
        print(f"Error loading molecules: {exc}")
        return []

def match_single_molecule(mol, patterns_dict, mol_id):
    """Test one molecule against every compiled SMARTS query.

    Returns
    -------
    dict or None
        ``{'mol_id', 'smiles', 'matches'}`` where ``matches`` maps pattern
        name -> bool; None when the molecule is missing.
    """
    if mol is None:
        return None

    smiles = Chem.MolToSmiles(mol)
    matches = {}
    for name, query in patterns_dict.items():
        try:
            # HasSubstructMatch already returns a bool.
            matches[name] = mol.HasSubstructMatch(query)
        except Exception:
            # Treat RDKit matching failures as "no match".
            matches[name] = False

    return {
        'mol_id': mol_id,
        'smiles': smiles,
        'matches': matches,
    }

print("Helper functions defined successfully")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load test molecules\n",
|
||||
"if 'test_sdf_file' in locals():\n",
|
||||
" # Load a subset for testing (adjust as needed)\n",
|
||||
" max_test_molecules = 1000 # Adjust this number based on your needs\n",
|
||||
" molecules = load_molecules_from_sdf(test_sdf_file, max_molecules=max_test_molecules)\n",
|
||||
" \n",
|
||||
" if molecules:\n",
|
||||
" print(f\"\\nLoaded {len(molecules)} molecules for testing\")\n",
|
||||
" print(f\"Sample molecule SMILES: {Chem.MolToSmiles(molecules[0])}\")\n",
|
||||
" else:\n",
|
||||
" print(\"No molecules loaded!\")\n",
|
||||
"else:\n",
|
||||
" print(\"No test SDF file selected!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Test sequential processing first\n",
|
||||
"if 'molecules' in locals() and molecules:\n",
|
||||
" print(\"=== Sequential Processing Test ===\")\n",
|
||||
" \n",
|
||||
" # Test with first 10 molecules\n",
|
||||
" test_mols = molecules[:10]\n",
|
||||
" \n",
|
||||
" start_time = time.time()\n",
|
||||
" sequential_results = []\n",
|
||||
" \n",
|
||||
" for i, mol in enumerate(test_mols):\n",
|
||||
" result = match_single_molecule(mol, compiled_patterns, i)\n",
|
||||
" if result:\n",
|
||||
" sequential_results.append(result)\n",
|
||||
" \n",
|
||||
" sequential_time = time.time() - start_time\n",
|
||||
" \n",
|
||||
" print(f\"Sequential processing completed in {sequential_time:.3f} seconds\")\n",
|
||||
" print(f\"Processed {len(sequential_results)} molecules\")\n",
|
||||
" print(f\"Average time per molecule: {sequential_time/len(sequential_results):.3f} seconds\")\n",
|
||||
" \n",
|
||||
" # Display sample result\n",
|
||||
" if sequential_results:\n",
|
||||
" sample = sequential_results[0]\n",
|
||||
" print(f\"\\nSample result:\")\n",
|
||||
" print(f\" Molecule ID: {sample['mol_id']}\")\n",
|
||||
" print(f\" SMILES: {sample['smiles']}\")\n",
|
||||
" print(f\" Matches: {sample['matches']}\")\n",
|
||||
"else:\n",
|
||||
" print(\"No molecules available for testing!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Parallel processing with joblib using 80% CPU\n",
|
||||
"if 'molecules' in locals() and molecules:\n",
|
||||
" print(f\"=== Parallel Processing with {target_cores} cores (80% CPU) ===\")\n",
|
||||
" \n",
|
||||
" # Use all loaded molecules or a subset for testing\n",
|
||||
" parallel_test_mols = molecules[:100] # Adjust this number as needed\n",
|
||||
" print(f\"Processing {len(parallel_test_mols)} molecules in parallel...\")\n",
|
||||
" \n",
|
||||
" start_time = time.time()\n",
|
||||
" \n",
|
||||
" # Run parallel processing\n",
|
||||
" parallel_results = Parallel(\n",
|
||||
" n_jobs=target_cores,\n",
|
||||
" backend='loky',\n",
|
||||
" verbose=1\n",
|
||||
" )(\n",
|
||||
" delayed(match_single_molecule)(mol, compiled_patterns, i) \n",
|
||||
" for i, mol in enumerate(parallel_test_mols)\n",
|
||||
" )\n",
|
||||
" \n",
|
||||
" # Filter out None results\n",
|
||||
" parallel_results = [r for r in parallel_results if r is not None]\n",
|
||||
" \n",
|
||||
" parallel_time = time.time() - start_time\n",
|
||||
" \n",
|
||||
" print(f\"\\nParallel processing completed in {parallel_time:.3f} seconds\")\n",
|
||||
" print(f\"Successfully processed {len(parallel_results)} molecules\")\n",
|
||||
" print(f\"Average time per molecule: {parallel_time/len(parallel_results):.3f} seconds\")\n",
|
||||
" print(f\"Processing speed: {len(parallel_results)/parallel_time:.1f} molecules/second\")\n",
|
||||
" \n",
|
||||
" # Calculate speedup\n",
|
||||
" if 'sequential_time' in locals():\n",
|
||||
" speedup = sequential_time / parallel_time\n",
|
||||
" efficiency = (speedup / target_cores) * 100\n",
|
||||
" print(f\"\\nPerformance Analysis:\")\n",
|
||||
" print(f\"Speedup: {speedup:.2f}x\")\n",
|
||||
" print(f\"Parallel efficiency: {efficiency:.1f}%\")\n",
|
||||
" \n",
|
||||
"else:\n",
|
||||
" print(\"No molecules available for parallel processing!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Convert results to DataFrame and analyze\n",
|
||||
"if 'parallel_results' in locals() and parallel_results:\n",
|
||||
" print(\"=== Results Analysis ===\")\n",
|
||||
" \n",
|
||||
" # Flatten results for DataFrame\n",
|
||||
" flattened_results = []\n",
|
||||
" for result in parallel_results:\n",
|
||||
" row = {\n",
|
||||
" 'mol_id': result['mol_id'],\n",
|
||||
" 'smiles': result['smiles']\n",
|
||||
" }\n",
|
||||
" row.update(result['matches'])\n",
|
||||
" flattened_results.append(row)\n",
|
||||
" \n",
|
||||
" results_df = pd.DataFrame(flattened_results)\n",
|
||||
" \n",
|
||||
" print(f\"Results DataFrame shape: {results_df.shape}\")\n",
|
||||
" print(f\"\\nColumns: {list(results_df.columns)}\")\n",
|
||||
" \n",
|
||||
" # Pattern matching statistics\n",
|
||||
" print(f\"\\nPattern Matching Statistics:\")\n",
|
||||
" for pattern_name in compiled_patterns.keys():\n",
|
||||
" count = results_df[pattern_name].sum()\n",
|
||||
" percentage = (count / len(results_df)) * 100\n",
|
||||
" print(f\" {pattern_name}: {count} molecules ({percentage:.1f}%)\")\n",
|
||||
" \n",
|
||||
" # Display some sample results\n",
|
||||
" print(f\"\\nSample results (first 5 molecules):\")\n",
|
||||
" print(results_df[['mol_id', 'smiles'] + list(compiled_patterns.keys())].head())\n",
|
||||
" \n",
|
||||
" # Save results\n",
|
||||
" results_dir = Path('../results')\n",
|
||||
" results_dir.mkdir(exist_ok=True)\n",
|
||||
" \n",
|
||||
" results_file = results_dir / f'parallel_matching_results_{len(parallel_results)}mols.csv'\n",
|
||||
" results_df.to_csv(results_file, index=False)\n",
|
||||
" print(f\"\\nResults saved to: {results_file}\")\n",
|
||||
" \n",
|
||||
"else:\n",
|
||||
" print(\"No results to analyze!\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Monitor CPU usage during processing\n",
|
||||
"print(\"=== CPU Usage Monitoring ===\")\n",
|
||||
"print(f\"Current CPU usage: {psutil.cpu_percent(interval=1):.1f}%\")\n",
|
||||
"print(f\"CPU usage per core: {psutil.cpu_percent(percpu=True)}\")\n",
|
||||
"\n",
|
||||
"# Memory usage\n",
|
||||
"memory = psutil.virtual_memory()\n",
|
||||
"print(f\"\\nMemory usage: {memory.percent:.1f}%\")\n",
|
||||
"print(f\"Available memory: {memory.available / (1024**3):.2f} GB\")\n",
|
||||
"\n",
|
||||
"# Process information\n",
|
||||
"current_process = psutil.Process()\n",
|
||||
"print(f\"\\nCurrent process memory usage: {current_process.memory_info().rss / (1024**2):.2f} MB\")\n",
|
||||
"print(f\"Current process CPU usage: {current_process.cpu_percent():.1f}%\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Summary\n",
|
||||
"\n",
|
||||
"This notebook successfully:\n",
|
||||
"1. ✅ Detected available CPU cores and calculated 80% usage\n",
|
||||
"2. ✅ Used joblib for parallel RDKit SMARTS matching\n",
|
||||
"3. ✅ Tested with a real SDF file from extracted_sdf_files directory\n",
|
||||
"4. ✅ Compared sequential vs parallel performance\n",
|
||||
"5. ✅ Monitored system resources during processing\n",
|
||||
"\n",
|
||||
"### Key Results:\n",
|
||||
"- **Total CPU cores**: reported at runtime by `multiprocessing.cpu_count()`\n",
"- **Used cores (80%)**: computed as `int(total_cores * 0.8)` in the first cell\n",
|
||||
"- **Parallel efficiency**: Calculated based on speedup\n",
|
||||
"- **Processing speed**: Molecules per second\n",
|
||||
"\n",
|
||||
"The results are saved in the `results` directory for further analysis."
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "default",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.14.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
430
notebooks/04_rdkit_parallel_analysis.ipynb
Normal file
430
notebooks/04_rdkit_parallel_analysis.ipynb
Normal file
@@ -0,0 +1,430 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# RDKit Parallel Processing Analysis\n",
|
||||
"\n",
|
||||
"分析RDKit匹配场景的并行性和SDF读取性能瓶颈"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"import multiprocessing\n",
|
||||
"import psutil\n",
|
||||
"from pathlib import Path\n",
|
||||
"from rdkit import Chem\n",
|
||||
"from rdkit.Chem import SDMolSupplier\n",
|
||||
"from joblib import Parallel, delayed\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"print(\"=== RDKit并行处理分析 ===\")\n",
|
||||
"print(f\"可用CPU核心数: {multiprocessing.cpu_count()}\")\n",
|
||||
"print(f\"使用80%核心: {int(multiprocessing.cpu_count() * 0.8)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1. SDF读取性能测试"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def test_sdf_loading_performance(sdf_path, molecule_counts=[100, 500, 1000, 5000, 10000]):
    """Benchmark SDF loading for increasing molecule counts.

    For each target count the file is re-opened and read until that many
    valid molecules are collected, recording wall time and resident-memory
    delta of the current process.

    Returns
    -------
    (dict, dict)
        count -> seconds and count -> MB mappings.
    """
    loading_times = {}
    memory_usage = {}

    print(f"测试文件: {sdf_path.name}")
    print("分子数量\t加载时间(s)\t内存使用(MB)\t速度(mol/s)")
    print("-" * 60)

    for count in molecule_counts:
        # Snapshot time and RSS before the read.
        t0 = time.time()
        proc = psutil.Process()
        rss_before = proc.memory_info().rss / (1024**2)

        reader = SDMolSupplier(str(sdf_path), sanitize=True)
        loaded = []
        for mol in reader:
            if mol is not None:
                loaded.append(mol)
                if len(loaded) >= count:
                    break

        elapsed = time.time() - t0
        rss_delta = proc.memory_info().rss / (1024**2) - rss_before
        rate = len(loaded) / elapsed if elapsed > 0 else 0

        loading_times[count] = elapsed
        memory_usage[count] = rss_delta

        print(f"{count}\t\t{elapsed:.3f}\t\t{rss_delta:.1f}\t\t{rate:.1f}")

    return loading_times, memory_usage

# Pick a file to benchmark.
extracted_sdf_dir = Path('../extracted_sdf_files')
sdf_files = list(extracted_sdf_dir.rglob('*.sdf'))

if sdf_files:
    test_file = sdf_files[0]
    loading_times, memory_usage = test_sdf_loading_performance(test_file)
else:
    print("未找到SDF文件")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 2. RDKit匹配性能测试"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def test_matching_performance(molecules, patterns_dict, molecule_counts=[100, 500, 1000, 5000]):
    """Time sequential SMARTS matching for several workload sizes.

    Returns
    -------
    dict
        count -> total matching time in seconds.
    """

    def match_single_molecule(mol, patterns):
        # name -> bool table; any RDKit error counts as "no match".
        table = {}
        for name, query in patterns.items():
            try:
                table[name] = mol.HasSubstructMatch(query)
            except:
                table[name] = False
        return table

    matching_times = {}

    print("分子数量\t匹配时间(s)\t速度(mol/s)\t平均时间/分子(ms)")
    print("-" * 60)

    for count in molecule_counts:
        if count > len(molecules):
            # Not enough molecules for this workload size.
            continue

        subset = molecules[:count]

        t0 = time.time()
        for mol in subset:
            match_single_molecule(mol, patterns_dict)
        elapsed = time.time() - t0

        rate = count / elapsed if elapsed > 0 else 0
        per_mol_ms = (elapsed / count) * 1000  # milliseconds per molecule
        matching_times[count] = elapsed

        print(f"{count}\t\t{elapsed:.3f}\t\t{rate:.1f}\t\t{per_mol_ms:.2f}")

    return matching_times

# Benchmark matching on molecules loaded from the test file.
if 'test_file' in locals():
    suppl = SDMolSupplier(str(test_file), sanitize=True)
    test_molecules = [mol for mol in suppl if mol is not None][:10000]

    # Small, representative query set for the benchmark.
    test_patterns = {
        'benzene': Chem.MolFromSmarts('c1ccccc1'),
        'alcohol': Chem.MolFromSmarts('[OX2H]'),
        'carboxylic_acid': Chem.MolFromSmarts('C(=O)O'),
        'amine': Chem.MolFromSmarts('[NX3;H2,H1;!$(NC=O)]')
    }

    print(f"\n=== 匹配性能测试 (使用{len(test_patterns)}个SMARTS模式) ===")
    matching_times = test_matching_performance(test_molecules, test_patterns)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 3. 并行vs串行性能对比"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def compare_parallel_vs_serial(molecules, patterns_dict, n_jobs_list=[1, 2, 4, 8, 16]):
    """Benchmark SMARTS matching at several joblib parallelism levels.

    The first configuration measured becomes the reference for speedup and
    parallel-efficiency figures (n_jobs=1 runs a plain in-process loop).

    Returns
    -------
    dict
        n_jobs -> {'time', 'speed', 'speedup', 'efficiency'}.
    """

    def match_single_molecule(mol, patterns):
        # Per-molecule match table; failures count as "no match".
        table = {}
        for name, query in patterns.items():
            try:
                table[name] = mol.HasSubstructMatch(query)
            except:
                table[name] = False
        return table

    bench_mols = molecules[:1000]  # fixed-size workload for the benchmark
    results = {}

    print(f"测试分子数量: {len(bench_mols)}")
    print(f"SMARTS模式数量: {len(patterns_dict)}")
    print("\n并行度\t时间(s)\t速度(mol/s)\t加速比\t效率(%)")
    print("-" * 65)

    serial_time = None

    for n_jobs in n_jobs_list:
        t0 = time.time()

        if n_jobs == 1:
            # Baseline: sequential loop in this process.
            for mol in bench_mols:
                match_single_molecule(mol, patterns_dict)
        else:
            Parallel(n_jobs=n_jobs, backend='loky')(
                delayed(match_single_molecule)(mol, patterns_dict)
                for mol in bench_mols
            )

        elapsed = time.time() - t0
        rate = len(bench_mols) / elapsed

        if serial_time is None:
            # First measurement is the reference point.
            serial_time = elapsed
            speedup, efficiency = 1.0, 100.0
        else:
            speedup = serial_time / elapsed
            efficiency = (speedup / n_jobs) * 100

        results[n_jobs] = {
            'time': elapsed,
            'speed': rate,
            'speedup': speedup,
            'efficiency': efficiency,
        }

        print(f"{n_jobs}\t\t{elapsed:.3f}\t\t{rate:.1f}\t\t{speedup:.2f}x\t{efficiency:.1f}")

    return results

# Run the parallel-vs-serial comparison.
# (At notebook top level locals() is the global namespace, so this check works.)
if 'test_molecules' in locals() and 'test_patterns' in locals():
    print("\n=== 并行vs串行性能对比 ===")
    max_jobs = min(16, int(multiprocessing.cpu_count() * 0.8))
    n_jobs_list = [1, 2, 4, 8, max_jobs] if max_jobs > 8 else [1, 2, 4, max_jobs]

    parallel_results = compare_parallel_vs_serial(test_molecules, test_patterns, n_jobs_list)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 4. I/O瓶颈分析"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def analyze_io_bottleneck(sdf_path=None):
    """Compare raw file-read time against RDKit parse time for one SDF file.

    Reads the file once as raw bytes (pure I/O) and once through
    ``SDMolSupplier`` (I/O + molecule construction), then reports the I/O
    share of total time and an optimization recommendation.

    Parameters
    ----------
    sdf_path : pathlib.Path, optional
        File to analyze. Defaults to the notebook-level ``test_file``.
    """
    print("=== I/O瓶颈分析 ===")

    # BUG FIX: the original checked `'test_file' in locals()` from inside
    # this function. A function's locals() never contains notebook globals,
    # so the check was always False and the analysis silently never ran.
    if sdf_path is None:
        sdf_path = globals().get('test_file')
    if sdf_path is None:
        print("No SDF file available for I/O analysis")
        return

    file_size = sdf_path.stat().st_size / (1024**2)  # MB

    print(f"\n文件信息:")
    print(f"文件大小: {file_size:.2f} MB")
    print(f"文件路径: {sdf_path}")

    # Raw read: disk throughput only, no chemistry.
    start_time = time.time()
    with open(sdf_path, 'rb') as f:
        f.read()
    raw_read_time = time.time() - start_time
    # Guard against a zero-duration read (tiny file, warm cache).
    raw_read_speed = file_size / raw_read_time if raw_read_time > 0 else float('inf')

    print(f"\n纯文件读取:")
    print(f"读取时间: {raw_read_time:.3f} s")
    print(f"读取速度: {raw_read_speed:.2f} MB/s")

    # Full RDKit parse: I/O plus molecule construction and sanitization.
    start_time = time.time()
    suppl = SDMolSupplier(str(sdf_path), sanitize=True)
    molecules = [mol for mol in suppl if mol is not None]
    parse_time = time.time() - start_time
    parse_speed = len(molecules) / parse_time if parse_time > 0 else 0.0

    print(f"\nRDKit解析:")
    print(f"解析时间: {parse_time:.3f} s")
    print(f"分子数量: {len(molecules)}")
    print(f"解析速度: {parse_speed:.1f} mol/s")

    # Share of total processing time spent on raw I/O.
    io_percentage = (raw_read_time / parse_time) * 100 if parse_time > 0 else 0.0
    print(f"\nI/O时间占比: {io_percentage:.1f}%")

    if io_percentage > 50:
        print("⚠️ I/O是主要瓶颈,建议:")
        print("  - 使用更快的存储(SSD)")
        print("  - 预处理SDF文件为其他格式")
        print("  - 使用内存映射")
    else:
        print("✅ 计算是主要瓶颈,并行化有效")

analyze_io_bottleneck()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 5. 优化建议"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
def provide_optimization_recommendations():
    """Print a structured summary of optimization advice for parallel
    RDKit workloads: common problems, their causes, and concrete fixes,
    followed by a best-practice cheat sheet."""
    # Each entry: (problem, cause, tuple of suggested solutions).
    entries = (
        (
            "SDF文件读取慢",
            "RDKit需要解析分子结构,sanitization耗时",
            (
                "1. 预处理SDF为pickle/feather格式",
                "2. 禁用不必要的sanitization",
                "3. 使用更快的存储介质",
                "4. 分批读取避免内存溢出",
            ),
        ),
        (
            "并行效率低",
            "RDKit的GIL限制和进程间通信开销",
            (
                "1. 使用多进程而非多线程",
                "2. 增大每个任务的粒度",
                "3. 预编译SMARTS模式",
                "4. 使用loky backend减少开销",
            ),
        ),
        (
            "内存使用过高",
            "分子对象在内存中占用空间大",
            (
                "1. 分批处理",
                "2. 及时释放不需要的分子",
                "3. 使用生成器而非列表",
                "4. 考虑使用SMILES字符串替代分子对象",
            ),
        ),
    )

    print("=== RDKit并行处理优化建议 ===")

    for idx, (problem, cause, solutions) in enumerate(entries, start=1):
        print(f"\n{idx}. {problem}")
        print(f"   原因: {cause}")
        print("   解决方案:")
        for line in solutions:
            print(f"     {line}")

    print("\n=== 最佳实践总结 ===")
    print("1. 🚀 对于大量小分子: 并行处理效果好")
    print("2. 📁 对于大文件: 预处理和分批读取更重要")
    print("3. 💾 内存受限: 使用流式处理和生成器")
    print("4. ⚡ CPU密集: 适当增加并行度,但避免过度并行")
    print("5. 🔄 I/O密集: 优化存储和文件格式")

provide_optimization_recommendations()
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 结论\n",
|
||||
"\n",
|
||||
"### RDKit + Joblib 并行性分析:\n",
|
||||
"\n",
|
||||
"1. **✅ 可以并行**: RDKit的分子匹配是CPU密集型任务,适合并行化\n",
|
||||
"2. **⚠️ 有效但有局限**: 由于GIL和进程开销,并行效率通常在60-80%\n",
|
||||
"3. **📊 最佳并行度**: 通常为CPU核心数的50-80%\n",
|
||||
"\n",
|
||||
"### SDF读取性能:\n",
|
||||
"\n",
|
||||
"1. **🐌 相对较慢**: SDF解析需要分子sanitization,比纯文本读取慢10-100倍\n",
|
||||
"2. **💾 内存密集**: 每个分子对象在内存中占用较大空间\n",
|
||||
"3. **🔄 I/O瓶颈**: 对于大文件,I/O可能成为主要瓶颈\n",
|
||||
"\n",
|
||||
"### 推荐策略:\n",
|
||||
"\n",
|
||||
"- **小文件+多模式**: 并行处理效果好\n",
|
||||
"- **大文件+少模式**: 优化I/O更重要\n",
|
||||
"- **混合策略**: 预处理 + 并行匹配 + 分批处理"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "search_macro",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
28085
notebooks/05_macrolactone_complete_fix.ipynb
Normal file
28085
notebooks/05_macrolactone_complete_fix.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
1151
notebooks/06_macrolactone_filtering_fixed.ipynb
Normal file
1151
notebooks/06_macrolactone_filtering_fixed.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
3105
notebooks/07_visualize_macrolactone_rings.ipynb
Normal file
3105
notebooks/07_visualize_macrolactone_rings.ipynb
Normal file
File diff suppressed because one or more lines are too long
5229
notebooks/filter_molecules.ipynb
Normal file
5229
notebooks/filter_molecules.ipynb
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user