Initial commit

This commit is contained in:
2025-11-14 18:46:03 +08:00
commit b85faf48cd
70 changed files with 57687 additions and 0 deletions

View File

@@ -0,0 +1,565 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extract SDF Files from ZIP Archives\n",
"\n",
"This notebook extracts all SDF files from ZIP archives and collects existing SDF files into a unified directory."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data directory: ../data\n",
"Output directory: ../extracted_sdf_files\n"
]
}
],
"source": [
"# Archive handling + filesystem utilities\n",
"import gzip\n",
"import os\n",
"import shutil\n",
"import tarfile\n",
"import zipfile\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"import rarfile\n",
"from tqdm import tqdm\n",
"\n",
"# Input tree with the raw archives, and the unified output directory\n",
"data_dir = Path('../data')\n",
"output_dir = Path('../extracted_sdf_files')\n",
"output_dir.mkdir(exist_ok=True)\n",
"\n",
"print(f\"Data directory: {data_dir}\")\n",
"print(f\"Output directory: {output_dir}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Searching for compressed files and SDF files...\n",
"Found 23 compressed files\n",
"Found 29 existing SDF files\n",
"\n",
"Compressed files:\n",
" 1. V1ePm5EdwFOnQFwMpOFJk.tar.gz (1.68 GB)\n",
" 2. chemial_software.rar (0.05 GB)\n",
" 3. 高性价比数据/Legancy 1740260 .zip (0.46 GB)\n",
" 4. 高性价比数据/D009 Dec-2024/D009-1396k.zip (0.51 GB)\n",
" 5. 高性价比数据/D011 Feb-2025/D011-576k.zip (0.21 GB)\n",
" 6. 高性价比数据/D013 Feb-2025/D013-294641.zip (0.08 GB)\n",
" 7. 高性价比数据/D111 Mar-2025/D111__439772.zip (0.18 GB)\n",
" 8. 高性价比数据/1-165万种核心数据库/D001 Feb-2025/D001-1614k.zip (0.64 GB)\n",
" 9. part3-1400w/D013 Feb-2025/D013-294641.zip (0.08 GB)\n",
" 10. part3-1400w/D015 jan-2023/D015.zip (0.31 GB)\n",
" ... and 13 more files\n"
]
}
],
"source": [
"# Walk the data tree once and sort every file into either the\n",
"# \"compressed archive\" or the \"already-extracted SDF\" bucket.\n",
"compressed_files = []\n",
"existing_sdf_files = []\n",
"\n",
"# Extensions recognised as archives vs. molecule files\n",
"compressed_extensions = {'.zip', '.rar', '.tar.gz', '.tgz', '.gz'}\n",
"sdf_extensions = {'.sdf', '.mol', '.sd'}\n",
"\n",
"print(\"Searching for compressed files and SDF files...\")\n",
"for file_path in data_dir.rglob('*'):\n",
"    if not file_path.is_file():\n",
"        continue\n",
"    # suffixes joined catches double extensions like .tar.gz\n",
"    joined_suffix = ''.join(file_path.suffixes).lower()\n",
"    if file_path.suffix.lower() in compressed_extensions or joined_suffix in {'.tar.gz', '.tgz'}:\n",
"        compressed_files.append(file_path)\n",
"    elif file_path.suffix.lower() in sdf_extensions:\n",
"        existing_sdf_files.append(file_path)\n",
"\n",
"print(f\"Found {len(compressed_files)} compressed files\")\n",
"print(f\"Found {len(existing_sdf_files)} existing SDF files\")\n",
"\n",
"# Preview what will be processed\n",
"print(\"\\nCompressed files:\")\n",
"for i, file in enumerate(compressed_files[:10]): # Show first 10\n",
"    print(f\" {i+1}. {file.relative_to(data_dir)} ({file.stat().st_size / (1024**3):.2f} GB)\")\n",
"if len(compressed_files) > 10:\n",
"    print(f\" ... and {len(compressed_files) - 10} more files\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def extract_sdf_from_archive(archive_path, extract_to):\n",
"    \"\"\"Extract SDF-like files (.sdf/.mol/.sd) from a single archive.\n",
"\n",
"    Supports .zip, .rar, .tar.gz/.tgz and single-member .gz archives.\n",
"    Returns a list of Paths of the extracted files. Errors are reported\n",
"    and produce a (possibly empty) partial list instead of raising, so a\n",
"    corrupt archive cannot abort the whole extraction run.\n",
"    \"\"\"\n",
"    sdf_suffixes = ('.sdf', '.mol', '.sd')\n",
"    extracted_files = []\n",
"    \n",
"    try:\n",
"        if archive_path.suffix.lower() == '.zip':\n",
"            with zipfile.ZipFile(archive_path, 'r') as zip_ref:\n",
"                for file_info in zip_ref.filelist:\n",
"                    if file_info.filename.lower().endswith(sdf_suffixes):\n",
"                        # ZipFile.extract returns the path it wrote to\n",
"                        extracted_path = zip_ref.extract(file_info, extract_to)\n",
"                        extracted_files.append(Path(extracted_path))\n",
"        \n",
"        elif archive_path.suffix.lower() == '.rar':\n",
"            with rarfile.RarFile(archive_path, 'r') as rar_ref:\n",
"                for file_info in rar_ref.infolist():\n",
"                    if file_info.filename.lower().endswith(sdf_suffixes):\n",
"                        rar_ref.extract(file_info, extract_to)\n",
"                        # Compute the destination ourselves: not every\n",
"                        # rarfile version returns the extracted path.\n",
"                        extracted_files.append(Path(extract_to) / file_info.filename)\n",
"        \n",
"        elif archive_path.suffix.lower() in {'.gz', '.tgz'} or ''.join(archive_path.suffixes).lower() in {'.tar.gz', '.tgz'}:\n",
"            if ''.join(archive_path.suffixes).lower() in {'.tar.gz', '.tgz'}:\n",
"                with tarfile.open(archive_path, 'r:gz') as tar_ref:\n",
"                    for member in tar_ref.getmembers():\n",
"                        if member.name.lower().endswith(sdf_suffixes):\n",
"                            # BUG FIX: TarFile.extract() returns None, so\n",
"                            # the old code appended Path(None), raising a\n",
"                            # TypeError that the broad except swallowed.\n",
"                            tar_ref.extract(member, extract_to)\n",
"                            extracted_files.append(Path(extract_to) / member.name)\n",
"            else:\n",
"                # Single gzip-compressed file: .stem drops the .gz suffix\n",
"                output_path = extract_to / archive_path.stem\n",
"                if output_path.suffix.lower() in {'.sdf', '.mol', '.sd'}:\n",
"                    with gzip.open(archive_path, 'rb') as gz_file:\n",
"                        with open(output_path, 'wb') as out_file:\n",
"                            shutil.copyfileobj(gz_file, out_file)\n",
"                    extracted_files.append(output_path)\n",
"    \n",
"    except Exception as e:\n",
"        # Best-effort: report and continue with whatever was extracted\n",
"        print(f\"Error extracting {archive_path}: {e}\")\n",
"    \n",
"    return extracted_files"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracting SDF files from compressed archives...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Extracting archives: 4%|▍ | 1/23 [00:03<01:22, 3.76s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error extracting ../data/V1ePm5EdwFOnQFwMpOFJk.tar.gz: Compressed file ended before the end-of-stream marker was reached\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Extracting archives: 61%|██████ | 14/23 [01:51<01:53, 12.64s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error extracting ../data/part3-1400w/D133 may-2023/D133.rar: Cannot find working tool\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Extracting archives: 100%|██████████| 23/23 [04:47<00:00, 12.49s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Error extracting ../data/part2-845w/D012 may-2023/D012-53344cpds.rar: Cannot find working tool\n",
"\n",
"Extracted 85 SDF files from archives\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Extract SDF files from all compressed archives\n",
"extracted_sdf_files = []\n",
"extraction_log = []\n",
"\n",
"print(\"Extracting SDF files from compressed archives...\")\n",
"for archive_path in tqdm(compressed_files, desc=\"Extracting archives\"):\n",
"    # BUG FIX: the extract directory was derived from the archive *stem*\n",
"    # only, so identically-named archives in different folders (e.g. the\n",
"    # two copies of D013-294641.zip) collided in one directory. Build the\n",
"    # name from the path relative to data_dir instead.\n",
"    rel_parent = archive_path.parent.relative_to(data_dir)\n",
"    flat_name = '_'.join(rel_parent.parts + (archive_path.stem,))\n",
"    archive_extract_dir = output_dir / f\"extracted_{flat_name}\"\n",
"    archive_extract_dir.mkdir(exist_ok=True)\n",
"    \n",
"    extracted = extract_sdf_from_archive(archive_path, archive_extract_dir)\n",
"    extracted_sdf_files.extend(extracted)\n",
"    \n",
"    # Record one log row per archive for the summary cell below\n",
"    extraction_log.append({\n",
"        'archive': str(archive_path.relative_to(data_dir)),\n",
"        'extracted_files': len(extracted),\n",
"        'extract_dir': str(archive_extract_dir.relative_to(output_dir))\n",
"    })\n",
"\n",
"print(f\"\\nExtracted {len(extracted_sdf_files)} SDF files from archives\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Copying existing SDF files...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Copying SDF files: 100%|██████████| 29/29 [01:50<00:00, 3.83s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Copied 29 existing SDF files\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"# Mirror the pre-existing SDF files into output_dir/existing/, keeping\n",
"# their original directory layout so provenance stays visible.\n",
"copied_sdf_files = []\n",
"\n",
"print(\"Copying existing SDF files...\")\n",
"for sdf_path in tqdm(existing_sdf_files, desc=\"Copying SDF files\"):\n",
"    relative_path = sdf_path.relative_to(data_dir)\n",
"    destination = output_dir / \"existing\" / relative_path\n",
"    destination.parent.mkdir(parents=True, exist_ok=True)\n",
"    \n",
"    try:\n",
"        # copy2 preserves timestamps/metadata alongside the contents\n",
"        shutil.copy2(sdf_path, destination)\n",
"    except Exception as e:\n",
"        print(f\"Error copying {sdf_path}: {e}\")\n",
"    else:\n",
"        copied_sdf_files.append(destination)\n",
"\n",
"print(f\"Copied {len(copied_sdf_files)} existing SDF files\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total SDF files available: 110\n",
"Total size: 90.10 GB\n",
"\n",
"Extraction Summary:\n",
" archive extracted_files \\\n",
"0 V1ePm5EdwFOnQFwMpOFJk.tar.gz 0 \n",
"1 chemial_software.rar 0 \n",
"2 高性价比数据/Legancy 1740260 .zip 4 \n",
"3 高性价比数据/D009 Dec-2024/D009-1396k.zip 3 \n",
"4 高性价比数据/D011 Feb-2025/D011-576k.zip 2 \n",
"5 高性价比数据/D013 Feb-2025/D013-294641.zip 1 \n",
"6 高性价比数据/D111 Mar-2025/D111__439772.zip 3 \n",
"7 高性价比数据/1-165万种核心数据库/D001 Feb-2025/D001-1614k.zip 4 \n",
"8 part3-1400w/D013 Feb-2025/D013-294641.zip 1 \n",
"9 part3-1400w/D015 jan-2023/D015.zip 6 \n",
"10 part3-1400w/D021 apr-2022/D021.zip 8 \n",
"11 part3-1400w/D058 Mar-2025/D058-718205.zip 3 \n",
"12 part3-1400w/D062 june-2024/D062.zip 4 \n",
"13 part3-1400w/D111 Mar-2025/D111__439772.zip 3 \n",
"14 part3-1400w/D133 may-2023/D133.rar 0 \n",
"15 part3-1400w/D140 Mar-2025/D140-370312.zip 2 \n",
"16 part3-1400w/D144 Oct-2023/D144-2023.zip 12 \n",
"17 part3-1400w/D147 May-2024/D147-5736K.zip 12 \n",
"18 part2-845w/D003 May-2025/D003-legacy 1738861 .zip 3 \n",
"19 part2-845w/D003 May-2025/D003_4573K.zip 10 \n",
"20 part2-845w/D010 Feb-2025/D010-428074.zip 3 \n",
"21 part2-845w/D011 April-2025/D011-582106.zip 1 \n",
"22 part2-845w/D012 may-2023/D012-53344cpds.rar 0 \n",
"\n",
" extract_dir \n",
"0 extracted_V1ePm5EdwFOnQFwMpOFJk.tar \n",
"1 extracted_chemial_software \n",
"2 extracted_Legancy 1740260 \n",
"3 extracted_D009-1396k \n",
"4 extracted_D011-576k \n",
"5 extracted_D013-294641 \n",
"6 extracted_D111__439772 \n",
"7 extracted_D001-1614k \n",
"8 extracted_D013-294641 \n",
"9 extracted_D015 \n",
"10 extracted_D021 \n",
"11 extracted_D058-718205 \n",
"12 extracted_D062 \n",
"13 extracted_D111__439772 \n",
"14 extracted_D133 \n",
"15 extracted_D140-370312 \n",
"16 extracted_D144-2023 \n",
"17 extracted_D147-5736K \n",
"18 extracted_D003-legacy 1738861 \n",
"19 extracted_D003_4573K \n",
"20 extracted_D010-428074 \n",
"21 extracted_D011-582106 \n",
"22 extracted_D012-53344cpds \n",
"\n",
"Saved SDF file list to: ../extracted_sdf_files/sdf_file_list.csv\n",
"\n",
"Largest SDF files:\n",
" relative_path size_mb\n",
"65 existing/part1-165w/D001 Aug-2025/D001-1550k/D... 1921.503638\n",
"81 existing/part3-1400w/D065 Mar-2025/D065-646891... 1830.950122\n",
"64 existing/part1-165w/D001 Aug-2025/D001-1550k/D... 1796.318578\n",
"66 existing/part1-165w/D001 Aug-2025/D001-1550k/D... 1793.656826\n",
"45 extracted_D003-legacy 1738861 /D003_legacy_2_6... 1699.557550\n",
"50 extracted_D003_4573K/D003_legacy_2_600000.sdf 1699.557550\n",
"60 extracted_D011-582106/D011-582106.sdf 1646.037828\n",
"44 extracted_D003-legacy 1738861 /D003_legacy_1_6... 1604.531812\n",
"49 extracted_D003_4573K/D003_legacy_1_600000.sdf 1604.531812\n",
"71 existing/part2-845w/D006 Sep-2025/D006-1697617... 1603.250914\n"
]
}
],
"source": [
"# Build one master inventory of every molecule file now under output_dir\n",
"all_sdf_files = []\n",
"for ext_pattern in ('*.sdf', '*.mol', '*.sd'):\n",
"    all_sdf_files.extend(output_dir.rglob(ext_pattern))\n",
"\n",
"print(f\"Total SDF files available: {len(all_sdf_files)}\")\n",
"\n",
"total_size = sum(f.stat().st_size for f in all_sdf_files)\n",
"print(f\"Total size: {total_size / (1024**3):.2f} GB\")\n",
"\n",
"# Per-archive extraction log collected in the extraction cell\n",
"extraction_df = pd.DataFrame(extraction_log)\n",
"print(\"\\nExtraction Summary:\")\n",
"print(extraction_df)\n",
"\n",
"# Persist the inventory (path + size) for the downstream notebooks\n",
"sdf_file_list = [\n",
"    {\n",
"        'file_path': str(sdf_file),\n",
"        'relative_path': str(sdf_file.relative_to(output_dir)),\n",
"        'size_bytes': sdf_file.stat().st_size,\n",
"        'size_mb': sdf_file.stat().st_size / (1024**2)\n",
"    }\n",
"    for sdf_file in all_sdf_files\n",
"]\n",
"\n",
"sdf_df = pd.DataFrame(sdf_file_list)\n",
"sdf_df.to_csv(output_dir / 'sdf_file_list.csv', index=False)\n",
"print(f\"\\nSaved SDF file list to: {output_dir / 'sdf_file_list.csv'}\")\n",
"\n",
"# Show the heaviest files first — these dominate downstream runtime\n",
"print(\"\\nLargest SDF files:\")\n",
"print(sdf_df.nlargest(10, 'size_mb')[['relative_path', 'size_mb']])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Testing SDF file readability...\n",
"✓ D003-4_l 460000.sdf: 460000 molecules\n",
"✓ D003-5_l 460000.sdf: 460000 molecules\n",
"✓ D003-6_l 460000.sdf: 460000 molecules\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[21:48:17] Explicit valence for atom # 20 B, 4, is greater than permitted\n",
"[21:48:17] ERROR: Could not sanitize molecule ending on line 35871530\n",
"[21:48:17] ERROR: Explicit valence for atom # 20 B, 4, is greater than permitted\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✓ D003-7_l 360260.sdf: 360259 molecules\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[21:49:06] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
"[21:49:06] ERROR: Could not sanitize molecule ending on line 7897106\n",
"[21:49:06] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
"[21:49:11] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 12 ignored\n",
"[21:49:11] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 8 ignored\n",
"[21:49:12] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 12 ignored\n",
"[21:49:12] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 14 ignored\n",
"[21:49:13] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
"[21:49:13] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
"[21:49:14] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
"[21:49:14] ERROR: Could not sanitize molecule ending on line 11196260\n",
"[21:49:14] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n",
"[21:49:20] Explicit valence for atom # 5 N, 4, is greater than permitted\n",
"[21:49:20] ERROR: Could not sanitize molecule ending on line 13936884\n",
"[21:49:20] ERROR: Explicit valence for atom # 5 N, 4, is greater than permitted\n",
"[21:49:23] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
"[21:49:23] ERROR: Could not sanitize molecule ending on line 15981200\n",
"[21:49:23] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
"[21:49:27] Explicit valence for atom # 6 O, 3, is greater than permitted\n",
"[21:49:27] ERROR: Could not sanitize molecule ending on line 18013023\n",
"[21:49:27] ERROR: Explicit valence for atom # 6 O, 3, is greater than permitted\n",
"[21:49:37] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
"[21:49:37] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
"[21:49:47] Explicit valence for atom # 7 N, 4, is greater than permitted\n",
"[21:49:47] ERROR: Could not sanitize molecule ending on line 30340115\n",
"[21:49:47] ERROR: Explicit valence for atom # 7 N, 4, is greater than permitted\n",
"[21:49:47] Explicit valence for atom # 5 N, 4, is greater than permitted\n",
"[21:49:47] ERROR: Could not sanitize molecule ending on line 30341543\n",
"[21:49:47] ERROR: Explicit valence for atom # 5 N, 4, is greater than permitted\n",
"[21:49:47] Explicit valence for atom # 2 C, 5, is greater than permitted\n",
"[21:49:47] ERROR: Could not sanitize molecule ending on line 30419985\n",
"[21:49:47] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted\n",
"[21:49:53] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
"[21:49:53] ERROR: Could not sanitize molecule ending on line 33667204\n",
"[21:49:53] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
"[21:49:54] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
"[21:49:54] ERROR: Could not sanitize molecule ending on line 34387487\n",
"[21:49:54] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
"[21:49:56] Explicit valence for atom # 3 N, 4, is greater than permitted\n",
"[21:49:56] ERROR: Could not sanitize molecule ending on line 35795575\n",
"[21:49:56] ERROR: Explicit valence for atom # 3 N, 4, is greater than permitted\n",
"[21:49:57] Explicit valence for atom # 1 N, 4, is greater than permitted\n",
"[21:49:57] ERROR: Could not sanitize molecule ending on line 36061075\n",
"[21:49:57] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted\n",
"[21:49:57] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
"[21:49:57] ERROR: Could not sanitize molecule ending on line 36061540\n",
"[21:49:57] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n",
"[21:49:57] Explicit valence for atom # 3 N, 4, is greater than permitted\n",
"[21:49:57] ERROR: Could not sanitize molecule ending on line 36064097\n",
"[21:49:57] ERROR: Explicit valence for atom # 3 N, 4, is greater than permitted\n",
"[21:50:05] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
"[21:50:05] ERROR: Could not sanitize molecule ending on line 41484117\n",
"[21:50:05] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n",
"[21:50:06] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
"[21:50:06] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
"[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n",
"[21:50:08] ERROR: Could not sanitize molecule ending on line 43160926\n",
"[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n",
"[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n",
"[21:50:08] ERROR: Could not sanitize molecule ending on line 43161491\n",
"[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n",
"[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n",
"[21:50:08] ERROR: Could not sanitize molecule ending on line 43357350\n",
"[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"✓ D009_part1_474930.sdf: 474912 molecules\n",
"\n",
"Extraction complete! Ready for substructure matching.\n"
]
}
],
"source": [
"# Sanity-check that a few of the collected files parse with RDKit\n",
"from rdkit import Chem\n",
"from rdkit.Chem import SDMolSupplier\n",
"\n",
"print(\"Testing SDF file readability...\")\n",
"test_files = all_sdf_files[:5] # Test first 5 files\n",
"\n",
"for sdf_file in test_files:\n",
"    try:\n",
"        suppl = SDMolSupplier(str(sdf_file))\n",
"        # Stream-count parsable molecules instead of materialising them\n",
"        # all in a list: some of these files exceed 1 GB / 400k molecules.\n",
"        n_mols = sum(1 for mol in suppl if mol is not None)\n",
"        print(f\"✓ {sdf_file.name}: {n_mols} molecules\")\n",
"    except Exception as e:\n",
"        print(f\"✗ {sdf_file.name}: Error - {e}\")\n",
"\n",
"print(\"\\nExtraction complete! Ready for substructure matching.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "default",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -0,0 +1,413 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RDKit Substructure Matching with Multiprocessing\n",
"\n",
"This notebook performs parallel substructure matching on all extracted SDF files using SMARTS patterns with 220 processes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import multiprocessing\n",
"import time\n",
"import warnings\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"from joblib import Parallel, delayed\n",
"from rdkit import Chem\n",
"from rdkit.Chem import SDMolSupplier, AllChem\n",
"from tqdm import tqdm\n",
"\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# Where the extracted SDF inventory lives and where results are written\n",
"sdf_dir = Path('../extracted_sdf_files')\n",
"results_dir = Path('../matching_results')\n",
"results_dir.mkdir(exist_ok=True)\n",
"\n",
"print(f\"SDF directory: {sdf_dir}\")\n",
"print(f\"Results directory: {results_dir}\")\n",
"\n",
"# File list produced by the extraction notebook\n",
"sdf_df = pd.read_csv(sdf_dir / 'sdf_file_list.csv')\n",
"print(f\"Found {len(sdf_df)} SDF files to process\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# SMARTS patterns used for substructure screening. Note that several are\n",
"# deliberately loose (e.g. 'carboxylic_acid' also matches esters).\n",
"smarts_patterns = {\n",
"    'benzene_ring': 'c1ccccc1',\n",
"    'pyridine': 'c1ccncc1',\n",
"    'carboxylic_acid': 'C(=O)O',\n",
"    'alcohol': '[OX2H]',\n",
"    'amine': '[NX3;H2,H1;!$(NC=O)]',\n",
"    'amide': 'C(=O)N',\n",
"    'ester': 'C(=O)OC',\n",
"    'ketone': 'C(=O)C',\n",
"    # BUG FIX: the old pattern 'C(=O)H' required an *explicit* hydrogen\n",
"    # atom and therefore never matched molecules with implicit Hs;\n",
"    # [CX3H1]=O matches a trivalent carbonyl carbon carrying one H.\n",
"    'aldehyde': '[CX3H1]=O',\n",
"    'nitro': '[N+](=O)[O-]',\n",
"    'halogen': '[Cl,Br,F,I]',\n",
"    'sulfonamide': 'S(=O)(=O)N',\n",
"    'heterocycle': '[n,o,s]',\n",
"    'aromatic_ring': '[a]',\n",
"    'alkene': '[C]=[C]',\n",
"    'alkyne': '[C]#[C]',\n",
"    'ether': '[OD2]([C])[C]',\n",
"    'phenol': 'c1ccc(cc1)[OX2H]'\n",
"}\n",
"\n",
"# Compile every SMARTS once up front and report any that fail\n",
"compiled_patterns = {}\n",
"for name, smarts in smarts_patterns.items():\n",
"    try:\n",
"        pattern = Chem.MolFromSmarts(smarts)\n",
"        if pattern is not None:\n",
"            compiled_patterns[name] = pattern\n",
"            print(f\"✓ Compiled SMARTS for {name}: {smarts}\")\n",
"        else:\n",
"            print(f\"✗ Failed to compile SMARTS for {name}: {smarts}\")\n",
"    except Exception as e:\n",
"        print(f\"✗ Error compiling SMARTS for {name}: {e}\")\n",
"\n",
"print(f\"\\nSuccessfully compiled {len(compiled_patterns)} SMARTS patterns\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def match_mol_to_patterns(mol, patterns_dict):\n",
"    \"\"\"Match a single molecule against all SMARTS patterns.\n",
"\n",
"    Returns None for unparsable molecules; otherwise a dict with a\n",
"    molecule identifier and a pattern-name -> bool map of hits.\n",
"    NOTE: the identifier is the SD-file title (_Name) when present and\n",
"    the canonical SMILES otherwise, but the key is 'smiles' either way —\n",
"    kept as-is for backward compatibility with downstream cells.\n",
"    \"\"\"\n",
"    if mol is None:\n",
"        return None\n",
"    \n",
"    # Prefer the SD-file title when available; fall back to SMILES\n",
"    if mol.HasProp('_Name'):\n",
"        mol_id = mol.GetProp('_Name')\n",
"    else:\n",
"        mol_id = Chem.MolToSmiles(mol)\n",
"    \n",
"    matches = {}\n",
"    for pattern_name, pattern in patterns_dict.items():\n",
"        try:\n",
"            matches[pattern_name] = bool(mol.HasSubstructMatch(pattern))\n",
"        except Exception:\n",
"            # Treat matching failures as \"no match\" rather than aborting\n",
"            matches[pattern_name] = False\n",
"    \n",
"    return {\n",
"        'smiles': mol_id,\n",
"        'matches': matches\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def process_sdf_file(sdf_path, patterns_dict, max_molecules=None):\n",
"    \"\"\"Process one SDF file and return per-molecule matching results.\n",
"\n",
"    Streams molecules from the supplier instead of materialising the\n",
"    whole file in memory first (some inputs exceed 1 GB). max_molecules,\n",
"    when given, caps the number of parsable molecules processed.\n",
"    Errors are reported and yield an empty list.\n",
"    \"\"\"\n",
"    try:\n",
"        suppl = SDMolSupplier(str(sdf_path), sanitize=True)\n",
"        results = []\n",
"        n_done = 0\n",
"        for mol in suppl:\n",
"            if mol is None:\n",
"                # Unparsable record — skip, do not count toward the cap\n",
"                continue\n",
"            if max_molecules is not None and n_done >= max_molecules:\n",
"                break\n",
"            result = match_mol_to_patterns(mol, patterns_dict)\n",
"            if result:\n",
"                result['sdf_file'] = str(sdf_path.relative_to(sdf_dir))\n",
"                results.append(result)\n",
"            n_done += 1\n",
"        return results\n",
"    except Exception as e:\n",
"        print(f\"Error processing {sdf_path}: {e}\")\n",
"        return []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Smoke-test the pipeline on the first three files, 100 molecules each\n",
"print(\"Testing with a small subset of files...\")\n",
"test_files = [Path(f) for f in sdf_df.head(3)['file_path']]\n",
"\n",
"start_time = time.time()\n",
"test_results = []\n",
"\n",
"for sdf_file in test_files:\n",
"    print(f\"Processing {sdf_file.name}...\")\n",
"    results = process_sdf_file(sdf_file, compiled_patterns, max_molecules=100)\n",
"    test_results.extend(results)\n",
"    print(f\" Found {len(results)} molecules\")\n",
"\n",
"test_time = time.time() - start_time\n",
"print(f\"\\nTest completed in {test_time:.2f} seconds\")\n",
"print(f\"Processed {len(test_results)} molecules\")\n",
"\n",
"# Show one sample record so the output schema is visible\n",
"if test_results:\n",
"    sample_result = test_results[0]\n",
"    print(f\"\\nSample result:\")\n",
"    print(f\" SMILES: {sample_result['smiles']}\")\n",
"    print(f\" Matches: {sample_result['matches']}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set up multiprocessing parameters\n",
"# NOTE(review): 220 workers is taken as a fixed requirement; on a machine\n",
"# with fewer cores loky will oversubscribe the CPU — confirm this is\n",
"# intended rather than min(220, multiprocessing.cpu_count()).\n",
"N_PROCESSES = 220 # As requested\n",
"N_FILES_PER_BATCH = 10 # Process files in batches to manage memory\n",
"\n",
"print(f\"Setting up multiprocessing with {N_PROCESSES} processes\")\n",
"print(f\"CPU cores available: {multiprocessing.cpu_count()}\")\n",
"\n",
"# Prepare file batches\n",
"# Slicing by a fixed stride partitions the list; the last batch may be short\n",
"all_files = [Path(f) for f in sdf_df['file_path'].tolist()]\n",
"file_batches = [all_files[i:i + N_FILES_PER_BATCH] for i in range(0, len(all_files), N_FILES_PER_BATCH)]\n",
"\n",
"print(f\"Processing {len(all_files)} files in {len(file_batches)} batches\")\n",
"print(f\"Average files per batch: {len(all_files) / len(file_batches):.1f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def process_file_batch(file_batch, patterns_dict, batch_id):\n",
"    \"\"\"Run process_sdf_file over every file in one batch.\n",
"\n",
"    Per-file failures are reported (tagged with batch_id) but do not\n",
"    abort the rest of the batch; all results are concatenated.\n",
"    \"\"\"\n",
"    batch_results = []\n",
"    for sdf_file in file_batch:\n",
"        try:\n",
"            batch_results.extend(process_sdf_file(sdf_file, patterns_dict))\n",
"        except Exception as e:\n",
"            print(f\"Error in batch {batch_id} processing {sdf_file}: {e}\")\n",
"    return batch_results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run parallel processing\n",
"print(\"Starting parallel substructure matching...\")\n",
"start_time = time.time()\n",
"\n",
"all_results = []\n",
"processed_batches = 0\n",
"\n",
"# NOTE(review): Parallel(...) here returns a fully materialised list, so\n",
"# every batch has already finished before tqdm starts iterating; the\n",
"# progress bar and the \"intermediate\" saves below therefore only run\n",
"# after the whole computation is done. joblib >= 1.3 offers\n",
"# return_as='generator' for true streaming — confirm the installed\n",
"# version before switching.\n",
"# Process batches with progress bar\n",
"for batch_results in tqdm(\n",
" Parallel(n_jobs=N_PROCESSES, backend='loky', verbose=1)(\n",
" delayed(process_file_batch)(batch, compiled_patterns, i) \n",
" for i, batch in enumerate(file_batches)\n",
" ),\n",
" total=len(file_batches),\n",
" desc=\"Processing batches\"\n",
"):\n",
" all_results.extend(batch_results)\n",
" processed_batches += 1\n",
" \n",
" # Save intermediate results every 10 batches\n",
" # (each snapshot rewrites the full accumulated results; the nested\n",
" # 'matches' dicts are serialised as their str() form in these CSVs)\n",
" if processed_batches % 10 == 0:\n",
" intermediate_df = pd.DataFrame(all_results)\n",
" intermediate_file = results_dir / f'intermediate_results_batch_{processed_batches}.csv'\n",
" intermediate_df.to_csv(intermediate_file, index=False)\n",
" print(f\"Saved intermediate results: {len(all_results)} molecules processed\")\n",
"\n",
"total_time = time.time() - start_time\n",
"print(f\"\\nProcessing completed in {total_time:.2f} seconds\")\n",
"print(f\"Total molecules processed: {len(all_results)}\")\n",
"print(f\"Average processing speed: {len(all_results) / total_time:.1f} molecules/second\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert results to DataFrame and save\n",
"if all_results:\n",
" # Flatten the results\n",
" # Each record is {'smiles', 'sdf_file', 'matches': {...}}; lift the\n",
" # per-pattern booleans out of 'matches' into top-level columns.\n",
" flattened_results = []\n",
" for result in all_results:\n",
" row = {\n",
" 'smiles': result['smiles'],\n",
" 'sdf_file': result['sdf_file']\n",
" }\n",
" row.update(result['matches'])\n",
" flattened_results.append(row)\n",
" \n",
" results_df = pd.DataFrame(flattened_results)\n",
" \n",
" # Save complete results\n",
" results_file = results_dir / 'complete_matching_results.csv'\n",
" results_df.to_csv(results_file, index=False)\n",
" print(f\"Saved complete results to: {results_file}\")\n",
" \n",
" # Generate summary statistics\n",
" # count = molecules matching the pattern; percentage is relative to\n",
" # all processed molecules (len(results_df) > 0 inside this branch)\n",
" summary_stats = {}\n",
" for pattern_name in compiled_patterns.keys():\n",
" count = results_df[pattern_name].sum()\n",
" percentage = (count / len(results_df)) * 100\n",
" summary_stats[pattern_name] = {\n",
" 'count': int(count),\n",
" 'percentage': percentage\n",
" }\n",
" \n",
" summary_df = pd.DataFrame(summary_stats).T\n",
" summary_file = results_dir / 'matching_summary.csv'\n",
" summary_df.to_csv(summary_file)\n",
" \n",
" print(f\"\\nMatching Summary:\")\n",
" print(summary_df)\n",
" \n",
" # Save molecules with specific patterns\n",
" # NOTE(review): common patterns (e.g. benzene_ring) can match most of\n",
" # the dataset, so these per-pattern CSVs may duplicate a large share\n",
" # of the complete results file — confirm this is wanted at scale.\n",
" for pattern_name in compiled_patterns.keys():\n",
" matching_mols = results_df[results_df[pattern_name] == True]\n",
" if len(matching_mols) > 0:\n",
" pattern_file = results_dir / f'molecules_with_{pattern_name}.csv'\n",
" matching_mols.to_csv(pattern_file, index=False)\n",
" print(f\"Saved {len(matching_mols)} molecules with {pattern_name} pattern\")\n",
"else:\n",
" print(\"No results to save\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Performance analysis\n",
"print(\"\\nPerformance Analysis:\")\n",
"print(f\"Total processing time: {total_time:.2f} seconds\")\n",
"print(f\"Total molecules processed: {len(all_results)}\")\n",
"print(f\"Number of processes used: {N_PROCESSES}\")\n",
"# Guard the rate/average computations: total_time can be ~0 on a trivial\n",
"# rerun and files_processed is 0 when every file failed — the unguarded\n",
"# divisions previously raised ZeroDivisionError here.\n",
"if total_time > 0:\n",
"    print(f\"Average molecules per second: {len(all_results) / total_time:.1f}\")\n",
"print(f\"Average molecules per process: {len(all_results) / N_PROCESSES:.1f}\")\n",
"\n",
"# File processing statistics\n",
"files_processed = len({r['sdf_file'] for r in all_results})\n",
"print(f\"Files successfully processed: {files_processed}/{len(all_files)}\")\n",
"if files_processed:\n",
"    print(f\"Average molecules per file: {len(all_results) / files_processed:.1f}\")\n",
"\n",
"# Memory usage estimation\n",
"if all_results:\n",
"    import sys\n",
"    # getsizeof measures only the outer list, not the dicts it holds,\n",
"    # so this figure is a lower bound on the real footprint.\n",
"    result_size = sys.getsizeof(all_results) / (1024**2)\n",
"    print(f\"Estimated memory usage for results: {result_size:.2f} MB\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create a quick visualization of pattern frequencies\n",
"# NOTE(review): only ImportError is handled below; if the results cell\n",
"# ended with \"No results to save\", results_df is undefined and both\n",
"# branches raise NameError — confirm this cell only runs after a\n",
"# successful match. seaborn is imported but otherwise unused here.\n",
"try:\n",
" import matplotlib.pyplot as plt\n",
" import seaborn as sns\n",
" \n",
" plt.figure(figsize=(12, 8))\n",
" pattern_counts = [results_df[pattern].sum() for pattern in compiled_patterns.keys()]\n",
" pattern_names = list(compiled_patterns.keys())\n",
" \n",
" # Create bar plot\n",
" plt.bar(range(len(pattern_names)), pattern_counts)\n",
" plt.xticks(range(len(pattern_names)), pattern_names, rotation=45, ha='right')\n",
" plt.ylabel('Number of Molecules')\n",
" plt.title('Substructure Pattern Frequencies')\n",
" plt.tight_layout()\n",
" \n",
" # Save plot\n",
" plot_file = results_dir / 'pattern_frequencies.png'\n",
" plt.savefig(plot_file, dpi=300, bbox_inches='tight')\n",
" plt.show()\n",
" \n",
" print(f\"Saved visualization to: {plot_file}\")\n",
" \n",
"except ImportError:\n",
" print(\"Matplotlib not available for visualization\")\n",
" print(\"Pattern counts:\")\n",
" for pattern_name in compiled_patterns.keys():\n",
" count = results_df[pattern_name].sum()\n",
" print(f\" {pattern_name}: {int(count)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"\n",
"This notebook successfully:\n",
"1. ✅ Set up RDKit and compiled SMARTS patterns\n",
"2. ✅ Implemented parallel processing with 220 processes\n",
"3. ✅ Processed all SDF files for substructure matching\n",
"4. ✅ Generated comprehensive results and statistics\n",
"5. ✅ Saved results in multiple formats for further analysis\n",
"\n",
"The results are saved in the `matching_results` directory with:\n",
"- Complete matching results CSV\n",
"- Summary statistics\n",
"- Individual pattern matches\n",
"- Performance analysis\n",
"- Visualization (if matplotlib available)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "default",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -0,0 +1,415 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CPU Detection and Joblib Parallel SMARTS Matching\n",
"\n",
"This notebook:\n",
"1. Detects available CPU cores and uses 80% of them\n",
"2. Uses joblib for parallel RDKit SMARTS matching\n",
"3. Tests with a single SDF file from extracted_sdf_files directory"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import multiprocessing\n",
"import psutil\n",
"import time\n",
"import warnings\n",
"from pathlib import Path\n",
"from rdkit import Chem\n",
"from rdkit.Chem import SDMolSupplier\n",
"from joblib import Parallel, delayed\n",
"from tqdm import tqdm\n",
"import pandas as pd\n",
"\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"print(\"=== CPU Detection and System Information ===\")\n",
"\n",
"# Get basic CPU information\n",
"total_cores = multiprocessing.cpu_count()\n",
"print(f\"Total CPU cores available: {total_cores}\")\n",
"\n",
"# Get physical vs logical cores using psutil\n",
"physical_cores = psutil.cpu_count(logical=False)\n",
"logical_cores = psutil.cpu_count(logical=True)\n",
"print(f\"Physical cores: {physical_cores}\")\n",
"print(f\"Logical cores: {logical_cores}\")\n",
"\n",
"# Calculate 80% of available cores (clamped to at least 1 so joblib's n_jobs stays valid on 1-core machines)\n",
"target_cores = max(1, int(total_cores * 0.8))\n",
"print(f\"\\nUsing 80% of CPU cores: {target_cores} cores\")\n",
"print(f\"Utilization: {(target_cores/total_cores)*100:.1f}%\")\n",
"\n",
"# Get current CPU usage\n",
"cpu_percent = psutil.cpu_percent(interval=1)\n",
"print(f\"Current CPU usage: {cpu_percent:.1f}%\")\n",
"\n",
"# Get memory information\n",
"memory = psutil.virtual_memory()\n",
"print(f\"\\nMemory Information:\")\n",
"print(f\"Total memory: {memory.total / (1024**3):.2f} GB\")\n",
"print(f\"Available memory: {memory.available / (1024**3):.2f} GB\")\n",
"print(f\"Memory usage: {memory.percent:.1f}%\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Set up paths\n",
"project_root = Path('..').resolve()\n",
"extracted_sdf_dir = project_root / 'extracted_sdf_files'\n",
"\n",
"print(f\"Project root: {project_root}\")\n",
"print(f\"Extracted SDF directory: {extracted_sdf_dir}\")\n",
"\n",
"# Find all SDF files in extracted_sdf_files\n",
"sdf_files = list(extracted_sdf_dir.rglob('*.sdf'))\n",
"print(f\"\\nFound {len(sdf_files)} SDF files in extracted_sdf_files directory\")\n",
"\n",
"# Display first few files\n",
"if sdf_files:\n",
" print(\"\\nFirst 5 SDF files:\")\n",
" for i, sdf_file in enumerate(sdf_files[:5]):\n",
" file_size = sdf_file.stat().st_size / (1024**2) # Size in MB\n",
" print(f\" {i+1}. {sdf_file.relative_to(project_root)} ({file_size:.2f} MB)\")\n",
" \n",
" # Select the first file for testing\n",
" test_sdf_file = sdf_files[0]\n",
" print(f\"\\nSelected test file: {test_sdf_file.relative_to(project_root)}\")\n",
"else:\n",
" print(\"No SDF files found in extracted_sdf_files directory!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Define SMARTS patterns for substructure matching\n",
"# Name -> SMARTS query for common functional groups / substructures.\n",
"smarts_patterns = {\n",
"    'benzene_ring': 'c1ccccc1',\n",
"    'pyridine': 'c1ccncc1', \n",
"    'carboxylic_acid': 'C(=O)O',\n",
"    'alcohol': '[OX2H]',\n",
"    'amine': '[NX3;H2,H1;!$(NC=O)]',\n",
"    'amide': 'C(=O)N',\n",
"    'ester': 'C(=O)OC',\n",
"    'ketone': 'C(=O)C',\n",
"    # [CX3H1](=O) matches the aldehyde carbon via its H-count property;\n",
"    # the previous 'C(=O)H' needed an explicit hydrogen atom and therefore\n",
"    # missed molecules with implicit Hs (the normal case for SDF input).\n",
"    'aldehyde': '[CX3H1](=O)',\n",
"    'nitro': '[N+](=O)[O-]',\n",
"    'halogen': '[Cl,Br,F,I]',\n",
"    'sulfonamide': 'S(=O)(=O)N',\n",
"    'heterocycle': '[n,o,s]',\n",
"    'aromatic_ring': '[a]',\n",
"    'alkene': '[C]=[C]',\n",
"    'alkyne': '[C]#[C]',\n",
"    'ether': '[OD2]([C])[C]',\n",
"    'phenol': 'c1ccc(cc1)[OX2H]'\n",
"}\n",
"\n",
"# Compile SMARTS patterns\n",
"compiled_patterns = {}\n",
"print(\"Compiling SMARTS patterns...\")\n",
"for name, smarts in smarts_patterns.items():\n",
" try:\n",
" pattern = Chem.MolFromSmarts(smarts)\n",
" if pattern is not None:\n",
" compiled_patterns[name] = pattern\n",
" print(f\"✓ {name}: {smarts}\")\n",
" else:\n",
" print(f\"✗ Failed to compile {name}: {smarts}\")\n",
" except Exception as e:\n",
" print(f\"✗ Error compiling {name}: {e}\")\n",
"\n",
"print(f\"\\nSuccessfully compiled {len(compiled_patterns)} SMARTS patterns\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def load_molecules_from_sdf(sdf_path, max_molecules=None):\n",
" \"\"\"Load molecules from SDF file.\"\"\"\n",
" print(f\"Loading molecules from {sdf_path.name}...\")\n",
" \n",
" try:\n",
" suppl = SDMolSupplier(str(sdf_path), sanitize=True)\n",
" molecules = []\n",
" \n",
" for i, mol in enumerate(suppl):\n",
" if mol is not None:\n",
" molecules.append(mol)\n",
" \n",
" if max_molecules and len(molecules) >= max_molecules:\n",
" break\n",
" \n",
" print(f\"Successfully loaded {len(molecules)} valid molecules\")\n",
" return molecules\n",
" \n",
" except Exception as e:\n",
" print(f\"Error loading molecules: {e}\")\n",
" return []\n",
"\n",
"def match_single_molecule(mol, patterns_dict, mol_id):\n",
" \"\"\"Match a single molecule against all SMARTS patterns.\"\"\"\n",
" if mol is None:\n",
" return None\n",
" \n",
" matches = {}\n",
" mol_smiles = Chem.MolToSmiles(mol)\n",
" \n",
" for pattern_name, pattern in patterns_dict.items():\n",
" try:\n",
" if mol.HasSubstructMatch(pattern):\n",
" matches[pattern_name] = True\n",
" else:\n",
" matches[pattern_name] = False\n",
" except Exception as e:\n",
" matches[pattern_name] = False\n",
" \n",
" return {\n",
" 'mol_id': mol_id,\n",
" 'smiles': mol_smiles,\n",
" 'matches': matches\n",
" }\n",
"\n",
"print(\"Helper functions defined successfully\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load test molecules\n",
"if 'test_sdf_file' in locals():\n",
" # Load a subset for testing (adjust as needed)\n",
" max_test_molecules = 1000 # Adjust this number based on your needs\n",
" molecules = load_molecules_from_sdf(test_sdf_file, max_molecules=max_test_molecules)\n",
" \n",
" if molecules:\n",
" print(f\"\\nLoaded {len(molecules)} molecules for testing\")\n",
" print(f\"Sample molecule SMILES: {Chem.MolToSmiles(molecules[0])}\")\n",
" else:\n",
" print(\"No molecules loaded!\")\n",
"else:\n",
" print(\"No test SDF file selected!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Test sequential processing first\n",
"if 'molecules' in locals() and molecules:\n",
" print(\"=== Sequential Processing Test ===\")\n",
" \n",
" # Test with first 10 molecules\n",
" test_mols = molecules[:10]\n",
" \n",
" start_time = time.time()\n",
" sequential_results = []\n",
" \n",
" for i, mol in enumerate(test_mols):\n",
" result = match_single_molecule(mol, compiled_patterns, i)\n",
" if result:\n",
" sequential_results.append(result)\n",
" \n",
" sequential_time = time.time() - start_time\n",
" \n",
" print(f\"Sequential processing completed in {sequential_time:.3f} seconds\")\n",
" print(f\"Processed {len(sequential_results)} molecules\")\n",
" print(f\"Average time per molecule: {sequential_time/len(sequential_results):.3f} seconds\")\n",
" \n",
" # Display sample result\n",
" if sequential_results:\n",
" sample = sequential_results[0]\n",
" print(f\"\\nSample result:\")\n",
" print(f\" Molecule ID: {sample['mol_id']}\")\n",
" print(f\" SMILES: {sample['smiles']}\")\n",
" print(f\" Matches: {sample['matches']}\")\n",
"else:\n",
" print(\"No molecules available for testing!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parallel processing with joblib using 80% CPU\n",
"if 'molecules' in locals() and molecules:\n",
" print(f\"=== Parallel Processing with {target_cores} cores (80% CPU) ===\")\n",
" \n",
" # Use all loaded molecules or a subset for testing\n",
" parallel_test_mols = molecules[:100] # Adjust this number as needed\n",
" print(f\"Processing {len(parallel_test_mols)} molecules in parallel...\")\n",
" \n",
" start_time = time.time()\n",
" \n",
" # Run parallel processing\n",
" parallel_results = Parallel(\n",
" n_jobs=target_cores,\n",
" backend='loky',\n",
" verbose=1\n",
" )(\n",
" delayed(match_single_molecule)(mol, compiled_patterns, i) \n",
" for i, mol in enumerate(parallel_test_mols)\n",
" )\n",
" \n",
" # Filter out None results\n",
" parallel_results = [r for r in parallel_results if r is not None]\n",
" \n",
" parallel_time = time.time() - start_time\n",
" \n",
" print(f\"\\nParallel processing completed in {parallel_time:.3f} seconds\")\n",
" print(f\"Successfully processed {len(parallel_results)} molecules\")\n",
" print(f\"Average time per molecule: {parallel_time/len(parallel_results):.3f} seconds\")\n",
" print(f\"Processing speed: {len(parallel_results)/parallel_time:.1f} molecules/second\")\n",
" \n",
" # Calculate speedup\n",
" if 'sequential_time' in locals():\n",
" speedup = sequential_time / parallel_time\n",
" efficiency = (speedup / target_cores) * 100\n",
" print(f\"\\nPerformance Analysis:\")\n",
" print(f\"Speedup: {speedup:.2f}x\")\n",
" print(f\"Parallel efficiency: {efficiency:.1f}%\")\n",
" \n",
"else:\n",
" print(\"No molecules available for parallel processing!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert results to DataFrame and analyze\n",
"if 'parallel_results' in locals() and parallel_results:\n",
" print(\"=== Results Analysis ===\")\n",
" \n",
" # Flatten results for DataFrame\n",
" flattened_results = []\n",
" for result in parallel_results:\n",
" row = {\n",
" 'mol_id': result['mol_id'],\n",
" 'smiles': result['smiles']\n",
" }\n",
" row.update(result['matches'])\n",
" flattened_results.append(row)\n",
" \n",
" results_df = pd.DataFrame(flattened_results)\n",
" \n",
" print(f\"Results DataFrame shape: {results_df.shape}\")\n",
" print(f\"\\nColumns: {list(results_df.columns)}\")\n",
" \n",
" # Pattern matching statistics\n",
" print(f\"\\nPattern Matching Statistics:\")\n",
" for pattern_name in compiled_patterns.keys():\n",
" count = results_df[pattern_name].sum()\n",
" percentage = (count / len(results_df)) * 100\n",
" print(f\" {pattern_name}: {count} molecules ({percentage:.1f}%)\")\n",
" \n",
" # Display some sample results\n",
" print(f\"\\nSample results (first 5 molecules):\")\n",
" print(results_df[['mol_id', 'smiles'] + list(compiled_patterns.keys())].head())\n",
" \n",
" # Save results\n",
" results_dir = Path('../results')\n",
" results_dir.mkdir(exist_ok=True)\n",
" \n",
" results_file = results_dir / f'parallel_matching_results_{len(parallel_results)}mols.csv'\n",
" results_df.to_csv(results_file, index=False)\n",
" print(f\"\\nResults saved to: {results_file}\")\n",
" \n",
"else:\n",
" print(\"No results to analyze!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Monitor CPU usage during processing\n",
"print(\"=== CPU Usage Monitoring ===\")\n",
"print(f\"Current CPU usage: {psutil.cpu_percent(interval=1):.1f}%\")\n",
"print(f\"CPU usage per core: {psutil.cpu_percent(percpu=True)}\")\n",
"\n",
"# Memory usage\n",
"memory = psutil.virtual_memory()\n",
"print(f\"\\nMemory usage: {memory.percent:.1f}%\")\n",
"print(f\"Available memory: {memory.available / (1024**3):.2f} GB\")\n",
"\n",
"# Process information\n",
"current_process = psutil.Process()\n",
"print(f\"\\nCurrent process memory usage: {current_process.memory_info().rss / (1024**2):.2f} MB\")\n",
"print(f\"Current process CPU usage: {current_process.cpu_percent():.1f}%\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Summary\n",
"\n",
"This notebook successfully:\n",
"1. ✅ Detected available CPU cores and calculated 80% usage\n",
"2. ✅ Used joblib for parallel RDKit SMARTS matching\n",
"3. ✅ Tested with a real SDF file from extracted_sdf_files directory\n",
"4. ✅ Compared sequential vs parallel performance\n",
"5. ✅ Monitored system resources during processing\n",
"\n",
"### Key Results:\n",
"- **Total CPU cores**: reported by the CPU-detection cell as `total_cores`\n",
"- **Used cores (80%)**: reported by the CPU-detection cell as `target_cores`\n",
"- **Parallel efficiency**: Calculated based on speedup\n",
"- **Processing speed**: Molecules per second\n",
"\n",
"The results are saved in the `results` directory for further analysis."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "default",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.14.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@@ -0,0 +1,430 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# RDKit Parallel Processing Analysis\n",
"\n",
"分析RDKit匹配场景的并行性和SDF读取性能瓶颈"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import multiprocessing\n",
"import psutil\n",
"from pathlib import Path\n",
"from rdkit import Chem\n",
"from rdkit.Chem import SDMolSupplier\n",
"from joblib import Parallel, delayed\n",
"import pandas as pd\n",
"import numpy as np\n",
"from tqdm import tqdm\n",
"import matplotlib.pyplot as plt\n",
"\n",
"print(\"=== RDKit并行处理分析 ===\")\n",
"print(f\"可用CPU核心数: {multiprocessing.cpu_count()}\")\n",
"print(f\"使用80%核心: {int(multiprocessing.cpu_count() * 0.8)}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. SDF读取性能测试"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def test_sdf_loading_performance(sdf_path, molecule_counts=[100, 500, 1000, 5000, 10000]):\n",
" \"\"\"测试不同分子数量下的SDF加载性能\"\"\"\n",
" \n",
" loading_times = {}\n",
" memory_usage = {}\n",
" \n",
" print(f\"测试文件: {sdf_path.name}\")\n",
" print(\"分子数量\\t加载时间(s)\\t内存使用(MB)\\t速度(mol/s)\")\n",
" print(\"-\" * 60)\n",
" \n",
" for count in molecule_counts:\n",
" # 记录开始时间和内存\n",
" start_time = time.time()\n",
" process = psutil.Process()\n",
" start_memory = process.memory_info().rss / (1024**2)\n",
" \n",
" # 加载分子\n",
" suppl = SDMolSupplier(str(sdf_path), sanitize=True)\n",
" molecules = []\n",
" \n",
" for i, mol in enumerate(suppl):\n",
" if mol is not None:\n",
" molecules.append(mol)\n",
" if len(molecules) >= count:\n",
" break\n",
" \n",
" # 记录结束时间和内存\n",
" end_time = time.time()\n",
" end_memory = process.memory_info().rss / (1024**2)\n",
" \n",
" loading_time = end_time - start_time\n",
" memory_used = end_memory - start_memory\n",
" speed = len(molecules) / loading_time if loading_time > 0 else 0\n",
" \n",
" loading_times[count] = loading_time\n",
" memory_usage[count] = memory_used\n",
" \n",
" print(f\"{count}\\t\\t{loading_time:.3f}\\t\\t{memory_used:.1f}\\t\\t{speed:.1f}\")\n",
" \n",
" return loading_times, memory_usage\n",
"\n",
"# 找一个测试文件\n",
"extracted_sdf_dir = Path('../extracted_sdf_files')\n",
"sdf_files = list(extracted_sdf_dir.rglob('*.sdf'))\n",
"\n",
"if sdf_files:\n",
" test_file = sdf_files[0]\n",
" loading_times, memory_usage = test_sdf_loading_performance(test_file)\n",
"else:\n",
" print(\"未找到SDF文件\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. RDKit匹配性能测试"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def test_matching_performance(molecules, patterns_dict, molecule_counts=[100, 500, 1000, 5000]):\n",
" \"\"\"测试不同分子数量下的匹配性能\"\"\"\n",
" \n",
" def match_single_molecule(mol, patterns):\n",
" matches = {}\n",
" for pattern_name, pattern in patterns.items():\n",
" try:\n",
" matches[pattern_name] = mol.HasSubstructMatch(pattern)\n",
" except:\n",
" matches[pattern_name] = False\n",
" return matches\n",
" \n",
" matching_times = {}\n",
" \n",
" print(\"分子数量\\t匹配时间(s)\\t速度(mol/s)\\t平均时间/分子(ms)\")\n",
" print(\"-\" * 60)\n",
" \n",
" for count in molecule_counts:\n",
" if count > len(molecules):\n",
" continue\n",
" \n",
" test_mols = molecules[:count]\n",
" \n",
" start_time = time.time()\n",
" \n",
" for mol in test_mols:\n",
" match_single_molecule(mol, patterns_dict)\n",
" \n",
" end_time = time.time()\n",
" matching_time = end_time - start_time\n",
" speed = count / matching_time if matching_time > 0 else 0\n",
" avg_time_per_mol = (matching_time / count) * 1000 # 转换为毫秒\n",
" \n",
" matching_times[count] = matching_time\n",
" \n",
" print(f\"{count}\\t\\t{matching_time:.3f}\\t\\t{speed:.1f}\\t\\t{avg_time_per_mol:.2f}\")\n",
" \n",
" return matching_times\n",
"\n",
"# 测试匹配性能\n",
"if 'test_file' in locals():\n",
" # 加载一些分子用于测试\n",
" suppl = SDMolSupplier(str(test_file), sanitize=True)\n",
" test_molecules = [mol for mol in suppl if mol is not None][:10000]\n",
" \n",
" # 定义测试SMARTS模式\n",
" test_patterns = {\n",
" 'benzene': Chem.MolFromSmarts('c1ccccc1'),\n",
" 'alcohol': Chem.MolFromSmarts('[OX2H]'),\n",
" 'carboxylic_acid': Chem.MolFromSmarts('C(=O)O'),\n",
" 'amine': Chem.MolFromSmarts('[NX3;H2,H1;!$(NC=O)]')\n",
" }\n",
" \n",
" print(f\"\\n=== 匹配性能测试 (使用{len(test_patterns)}个SMARTS模式) ===\")\n",
" matching_times = test_matching_performance(test_molecules, test_patterns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. 并行vs串行性能对比"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def compare_parallel_vs_serial(molecules, patterns_dict, n_jobs_list=[1, 2, 4, 8, 16]):\n",
" \"\"\"比较不同并行度下的性能\"\"\"\n",
" \n",
" def match_single_molecule(mol, patterns):\n",
" matches = {}\n",
" for pattern_name, pattern in patterns.items():\n",
" try:\n",
" matches[pattern_name] = mol.HasSubstructMatch(pattern)\n",
" except:\n",
" matches[pattern_name] = False\n",
" return matches\n",
" \n",
" test_mols = molecules[:1000] # 使用1000个分子测试\n",
" results = {}\n",
" \n",
" print(f\"测试分子数量: {len(test_mols)}\")\n",
" print(f\"SMARTS模式数量: {len(patterns_dict)}\")\n",
" print(\"\\n并行度\\t时间(s)\\t速度(mol/s)\\t加速比\\t效率(%)\")\n",
" print(\"-\" * 65)\n",
" \n",
" serial_time = None\n",
" \n",
" for n_jobs in n_jobs_list:\n",
" start_time = time.time()\n",
" \n",
" if n_jobs == 1:\n",
" # 串行处理\n",
" for mol in test_mols:\n",
" match_single_molecule(mol, patterns_dict)\n",
" else:\n",
" # 并行处理\n",
" Parallel(n_jobs=n_jobs, backend='loky')(\n",
" delayed(match_single_molecule)(mol, patterns_dict) \n",
" for mol in test_mols\n",
" )\n",
" \n",
" end_time = time.time()\n",
" processing_time = end_time - start_time\n",
" speed = len(test_mols) / processing_time\n",
" \n",
" if serial_time is None:\n",
" serial_time = processing_time\n",
" speedup = 1.0\n",
" efficiency = 100.0\n",
" else:\n",
" speedup = serial_time / processing_time\n",
" efficiency = (speedup / n_jobs) * 100\n",
" \n",
" results[n_jobs] = {\n",
" 'time': processing_time,\n",
" 'speed': speed,\n",
" 'speedup': speedup,\n",
" 'efficiency': efficiency\n",
" }\n",
" \n",
" print(f\"{n_jobs}\\t\\t{processing_time:.3f}\\t\\t{speed:.1f}\\t\\t{speedup:.2f}x\\t{efficiency:.1f}\")\n",
" \n",
" return results\n",
"\n",
"# 运行并行vs串行对比\n",
"if 'test_molecules' in locals() and 'test_patterns' in locals():\n",
" print(\"\\n=== 并行vs串行性能对比 ===\")\n",
" max_jobs = min(16, int(multiprocessing.cpu_count() * 0.8))\n",
" n_jobs_list = [1, 2, 4, 8, max_jobs] if max_jobs > 8 else [1, 2, 4, max_jobs]\n",
" \n",
" parallel_results = compare_parallel_vs_serial(test_molecules, test_patterns, n_jobs_list)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. I/O瓶颈分析"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def analyze_io_bottleneck(sdf_path=None):\n",
"    \"\"\"Analyze whether raw file I/O or RDKit parsing dominates SDF loading.\n",
"\n",
"    Bug fix: the original guarded the body with `'test_file' in locals()`,\n",
"    but `locals()` inside a function never contains notebook globals, so the\n",
"    whole analysis was silently skipped on every run. We now accept an\n",
"    explicit path and fall back to the notebook-level `test_file` via\n",
"    `globals()`.\n",
"    \"\"\"\n",
"    print(\"=== I/O瓶颈分析 ===\")\n",
"\n",
"    if sdf_path is None:\n",
"        sdf_path = globals().get('test_file')\n",
"    if sdf_path is None:\n",
"        print(\"未找到SDF测试文件\")\n",
"        return\n",
"\n",
"    file_size = sdf_path.stat().st_size / (1024**2)  # MB\n",
"\n",
"    print(f\"\\n文件信息:\")\n",
"    print(f\"文件大小: {file_size:.2f} MB\")\n",
"    print(f\"文件路径: {sdf_path}\")\n",
"\n",
"    # Raw byte-read speed: the pure I/O floor, no chemistry involved.\n",
"    start_time = time.time()\n",
"    with open(sdf_path, 'rb') as f:\n",
"        raw_data = f.read()\n",
"    raw_read_time = time.time() - start_time\n",
"    # Guard against a ~0 elapsed time on tiny/cached files.\n",
"    raw_read_speed = file_size / raw_read_time if raw_read_time > 0 else float('inf')\n",
"\n",
"    print(f\"\\n纯文件读取:\")\n",
"    print(f\"读取时间: {raw_read_time:.3f} s\")\n",
"    print(f\"读取速度: {raw_read_speed:.2f} MB/s\")\n",
"\n",
"    # RDKit parse speed: includes sanitization, the expensive part.\n",
"    start_time = time.time()\n",
"    suppl = SDMolSupplier(str(sdf_path), sanitize=True)\n",
"    molecules = [mol for mol in suppl if mol is not None]\n",
"    parse_time = time.time() - start_time\n",
"    parse_speed = len(molecules) / parse_time if parse_time > 0 else float('inf')\n",
"\n",
"    print(f\"\\nRDKit解析:\")\n",
"    print(f\"解析时间: {parse_time:.3f} s\")\n",
"    print(f\"分子数量: {len(molecules)}\")\n",
"    print(f\"解析速度: {parse_speed:.1f} mol/s\")\n",
"\n",
"    # Share of the total parse wall-time that is attributable to raw I/O.\n",
"    io_percentage = (raw_read_time / parse_time) * 100 if parse_time > 0 else 0.0\n",
"    print(f\"\\nI/O时间占比: {io_percentage:.1f}%\")\n",
"\n",
"    if io_percentage > 50:\n",
"        print(\"⚠️ I/O是主要瓶颈建议:\")\n",
"        print(\"  - 使用更快的存储(SSD)\")\n",
"        print(\"  - 预处理SDF文件为其他格式\")\n",
"        print(\"  - 使用内存映射\")\n",
"    else:\n",
"        print(\"✅ 计算是主要瓶颈,并行化有效\")\n",
"\n",
"analyze_io_bottleneck()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. 优化建议"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def provide_optimization_recommendations():\n",
" \"\"\"提供优化建议\"\"\"\n",
" \n",
" print(\"=== RDKit并行处理优化建议 ===\")\n",
" \n",
" recommendations = [\n",
" {\n",
" \"问题\": \"SDF文件读取慢\",\n",
" \"原因\": \"RDKit需要解析分子结构sanitization耗时\",\n",
" \"解决方案\": [\n",
" \"1. 预处理SDF为pickle/feather格式\",\n",
" \"2. 禁用不必要的sanitization\",\n",
" \"3. 使用更快的存储介质\",\n",
" \"4. 分批读取避免内存溢出\"\n",
" ]\n",
" },\n",
" {\n",
" \"问题\": \"并行效率低\",\n",
" \"原因\": \"RDKit的GIL限制和进程间通信开销\",\n",
" \"解决方案\": [\n",
" \"1. 使用多进程而非多线程\",\n",
" \"2. 增大每个任务的粒度\",\n",
" \"3. 预编译SMARTS模式\",\n",
" \"4. 使用loky backend减少开销\"\n",
" ]\n",
" },\n",
" {\n",
" \"问题\": \"内存使用过高\",\n",
" \"原因\": \"分子对象在内存中占用空间大\",\n",
" \"解决方案\": [\n",
" \"1. 分批处理\",\n",
" \"2. 及时释放不需要的分子\",\n",
" \"3. 使用生成器而非列表\",\n",
" \"4. 考虑使用SMILES字符串替代分子对象\"\n",
" ]\n",
" }\n",
" ]\n",
" \n",
" for i, rec in enumerate(recommendations, 1):\n",
" print(f\"\\n{i}. {rec['问题']}\")\n",
" print(f\" 原因: {rec['原因']}\")\n",
" print(f\" 解决方案:\")\n",
" for solution in rec['解决方案']:\n",
" print(f\" {solution}\")\n",
" \n",
" print(f\"\\n=== 最佳实践总结 ===\")\n",
" print(\"1. 🚀 对于大量小分子: 并行处理效果好\")\n",
" print(\"2. 📁 对于大文件: 预处理和分批读取更重要\")\n",
" print(\"3. 💾 内存受限: 使用流式处理和生成器\")\n",
" print(\"4. ⚡ CPU密集: 适当增加并行度,但避免过度并行\")\n",
" print(\"5. 🔄 I/O密集: 优化存储和文件格式\")\n",
"\n",
"provide_optimization_recommendations()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 结论\n",
"\n",
"### RDKit + Joblib 并行性分析:\n",
"\n",
"1. **✅ 可以并行**: RDKit的分子匹配是CPU密集型任务适合并行化\n",
"2. **⚠️ 有效但有局限**: loky 为多进程后端不受 GIL 限制,但进程启动与分子序列化开销使并行效率通常在60-80%\n",
"3. **📊 最佳并行度**: 通常为CPU核心数的50-80%\n",
"\n",
"### SDF读取性能:\n",
"\n",
"1. **🐌 相对较慢**: SDF解析需要分子sanitization比纯文本读取慢10-100倍\n",
"2. **💾 内存密集**: 每个分子对象在内存中占用较大空间\n",
"3. **🔄 I/O瓶颈**: 对于大文件I/O可能成为主要瓶颈\n",
"\n",
"### 推荐策略:\n",
"\n",
"- **小文件+多模式**: 并行处理效果好\n",
"- **大文件+少模式**: 优化I/O更重要\n",
"- **混合策略**: 预处理 + 并行匹配 + 分批处理"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "search_macro",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long