{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Extract SDF Files from ZIP Archives\n", "\n", "This notebook extracts all SDF files from ZIP archives and collects existing SDF files into a unified directory." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Data directory: ../data\n", "Output directory: ../extracted_sdf_files\n" ] } ], "source": [ "import zipfile\n", "import rarfile\n", "import tarfile\n", "import gzip\n", "import shutil\n", "from pathlib import Path\n", "import os\n", "from tqdm import tqdm\n", "import pandas as pd\n", "\n", "# Set up paths\n", "data_dir = Path('../data')\n", "output_dir = Path('../extracted_sdf_files')\n", "output_dir.mkdir(exist_ok=True)\n", "\n", "print(f\"Data directory: {data_dir}\")\n", "print(f\"Output directory: {output_dir}\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Searching for compressed files and SDF files...\n", "Found 23 compressed files\n", "Found 29 existing SDF files\n", "\n", "Compressed files:\n", " 1. V1ePm5EdwFOnQFwMpOFJk.tar.gz (1.68 GB)\n", " 2. chemial_software.rar (0.05 GB)\n", " 3. 高性价比数据/Legancy 1740260 .zip (0.46 GB)\n", " 4. 高性价比数据/D009 Dec-2024/D009-1396k.zip (0.51 GB)\n", " 5. 高性价比数据/D011 Feb-2025/D011-576k.zip (0.21 GB)\n", " 6. 高性价比数据/D013 Feb-2025/D013-294641.zip (0.08 GB)\n", " 7. 高性价比数据/D111 Mar-2025/D111__439772.zip (0.18 GB)\n", " 8. 高性价比数据/1-165万种核心数据库/D001 Feb-2025/D001-1614k.zip (0.64 GB)\n", " 9. part3-1400w/D013 Feb-2025/D013-294641.zip (0.08 GB)\n", " 10. part3-1400w/D015 jan-2023/D015.zip (0.31 GB)\n", " ... and 13 more files\n" ] } ], "source": [ "# Find all compressed files and existing SDF files\n", "compressed_files = []\n", "existing_sdf_files = []\n", "\n", "# Define file extensions for compressed files\n", "compressed_extensions = {'.zip', '.rar', '.tar.gz', '.tgz', '.gz'}\n", "sdf_extensions = {'.sdf', '.mol', '.sd'}\n", "\n", "print(\"Searching for compressed files and SDF files...\")\n", "for file_path in data_dir.rglob('*'):\n", " if file_path.is_file():\n", " if file_path.suffix.lower() in compressed_extensions or \\\n", " ''.join(file_path.suffixes).lower() in {'.tar.gz', '.tgz'}:\n", " compressed_files.append(file_path)\n", " elif file_path.suffix.lower() in sdf_extensions:\n", " existing_sdf_files.append(file_path)\n", "\n", "print(f\"Found {len(compressed_files)} compressed files\")\n", "print(f\"Found {len(existing_sdf_files)} existing SDF files\")\n", "\n", "# Display compressed files\n", "print(\"\\nCompressed files:\")\n", "for i, file in enumerate(compressed_files[:10]): # Show first 10\n", " print(f\" {i+1}. {file.relative_to(data_dir)} ({file.stat().st_size / (1024**3):.2f} GB)\")\n", "if len(compressed_files) > 10:\n", " print(f\" ... and {len(compressed_files) - 10} more files\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def extract_sdf_from_archive(archive_path, extract_to):\n", " \"\"\"Extract SDF files from various archive formats.\"\"\"\n", " extracted_files = []\n", " \n", " try:\n", " if archive_path.suffix.lower() == '.zip':\n", " with zipfile.ZipFile(archive_path, 'r') as zip_ref:\n", " for file_info in zip_ref.filelist:\n", " if file_info.filename.lower().endswith(('.sdf', '.mol', '.sd')):\n", " # Extract with full path structure preserved\n", " extracted_path = zip_ref.extract(file_info, extract_to)\n", " extracted_files.append(Path(extracted_path))\n", " \n", " elif archive_path.suffix.lower() == '.rar':\n", " with rarfile.RarFile(archive_path, 'r') as rar_ref:\n", " for file_info in rar_ref.infolist():\n", " if file_info.filename.lower().endswith(('.sdf', '.mol', '.sd')):\n", " extracted_path = rar_ref.extract(file_info, extract_to)\n", " extracted_files.append(Path(extracted_path))\n", " \n", " elif archive_path.suffix.lower() in {'.gz', '.tgz'} or ''.join(archive_path.suffixes).lower() in {'.tar.gz', '.tgz'}:\n", " if ''.join(archive_path.suffixes).lower() in {'.tar.gz', '.tgz'}:\n", " with tarfile.open(archive_path, 'r:gz') as tar_ref:\n", " for member in tar_ref.getmembers():\n", " if member.name.lower().endswith(('.sdf', '.mol', '.sd')):\n", " extracted_path = tar_ref.extract(member, extract_to)\n", " extracted_files.append(Path(extracted_path))\n", " else:\n", " # Single gzip file\n", " output_path = extract_to / archive_path.stem\n", " if output_path.suffix.lower() in {'.sdf', '.mol', '.sd'}:\n", " with gzip.open(archive_path, 'rb') as gz_file:\n", " with open(output_path, 'wb') as out_file:\n", " shutil.copyfileobj(gz_file, out_file)\n", " extracted_files.append(output_path)\n", " \n", " except Exception as e:\n", " print(f\"Error extracting {archive_path}: {e}\")\n", " \n", " return extracted_files" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Extracting SDF files from compressed archives...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Extracting archives: 4%|▍ | 1/23 [00:03<01:22, 3.76s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error extracting ../data/V1ePm5EdwFOnQFwMpOFJk.tar.gz: Compressed file ended before the end-of-stream marker was reached\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Extracting archives: 61%|██████ | 14/23 [01:51<01:53, 12.64s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error extracting ../data/part3-1400w/D133 may-2023/D133.rar: Cannot find working tool\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Extracting archives: 100%|██████████| 23/23 [04:47<00:00, 12.49s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Error extracting ../data/part2-845w/D012 may-2023/D012-53344cpds.rar: Cannot find working tool\n", "\n", "Extracted 85 SDF files from archives\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Extract SDF files from all compressed archives\n", "extracted_sdf_files = []\n", "extraction_log = []\n", "\n", "print(\"Extracting SDF files from compressed archives...\")\n", "for archive_path in tqdm(compressed_files, desc=\"Extracting archives\"):\n", " # Create a subdirectory for each archive to maintain organization\n", " archive_extract_dir = output_dir / f\"extracted_{archive_path.stem}\"\n", " archive_extract_dir.mkdir(exist_ok=True)\n", " \n", " extracted = extract_sdf_from_archive(archive_path, archive_extract_dir)\n", " extracted_sdf_files.extend(extracted)\n", " \n", " extraction_log.append({\n", " 'archive': str(archive_path.relative_to(data_dir)),\n", " 'extracted_files': len(extracted),\n", " 'extract_dir': str(archive_extract_dir.relative_to(output_dir))\n", " })\n", "\n", "print(f\"\\nExtracted {len(extracted_sdf_files)} SDF files from archives\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Copying existing SDF files...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Copying SDF files: 100%|██████████| 29/29 [01:50<00:00, 3.83s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Copied 29 existing SDF files\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# Copy existing SDF files to the output directory\n", "copied_sdf_files = []\n", "\n", "print(\"Copying existing SDF files...\")\n", "for sdf_path in tqdm(existing_sdf_files, desc=\"Copying SDF files\"):\n", " # Create a subdirectory to maintain original path structure\n", " relative_path = sdf_path.relative_to(data_dir)\n", " destination = output_dir / \"existing\" / relative_path\n", " destination.parent.mkdir(parents=True, exist_ok=True)\n", " \n", " try:\n", " shutil.copy2(sdf_path, destination)\n", " copied_sdf_files.append(destination)\n", " except Exception as e:\n", " print(f\"Error copying {sdf_path}: {e}\")\n", "\n", "print(f\"Copied {len(copied_sdf_files)} existing SDF files\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total SDF files available: 110\n", "Total size: 90.10 GB\n", "\n", "Extraction Summary:\n", " archive extracted_files \\\n", "0 V1ePm5EdwFOnQFwMpOFJk.tar.gz 0 \n", "1 chemial_software.rar 0 \n", "2 高性价比数据/Legancy 1740260 .zip 4 \n", "3 高性价比数据/D009 Dec-2024/D009-1396k.zip 3 \n", "4 高性价比数据/D011 Feb-2025/D011-576k.zip 2 \n", "5 高性价比数据/D013 Feb-2025/D013-294641.zip 1 \n", "6 高性价比数据/D111 Mar-2025/D111__439772.zip 3 \n", "7 高性价比数据/1-165万种核心数据库/D001 Feb-2025/D001-1614k.zip 4 \n", "8 part3-1400w/D013 Feb-2025/D013-294641.zip 1 \n", "9 part3-1400w/D015 jan-2023/D015.zip 6 \n", "10 part3-1400w/D021 apr-2022/D021.zip 8 \n", "11 part3-1400w/D058 Mar-2025/D058-718205.zip 3 \n", "12 part3-1400w/D062 june-2024/D062.zip 4 \n", "13 part3-1400w/D111 Mar-2025/D111__439772.zip 3 \n", "14 part3-1400w/D133 may-2023/D133.rar 0 \n", "15 part3-1400w/D140 Mar-2025/D140-370312.zip 2 \n", "16 part3-1400w/D144 Oct-2023/D144-2023.zip 12 \n", "17 part3-1400w/D147 May-2024/D147-5736K.zip 12 \n", "18 part2-845w/D003 May-2025/D003-legacy 1738861 .zip 3 \n", "19 part2-845w/D003 May-2025/D003_4573K.zip 10 \n", "20 part2-845w/D010 Feb-2025/D010-428074.zip 3 \n", "21 part2-845w/D011 April-2025/D011-582106.zip 1 \n", "22 part2-845w/D012 may-2023/D012-53344cpds.rar 0 \n", "\n", " extract_dir \n", "0 extracted_V1ePm5EdwFOnQFwMpOFJk.tar \n", "1 extracted_chemial_software \n", "2 extracted_Legancy 1740260 \n", "3 extracted_D009-1396k \n", "4 extracted_D011-576k \n", "5 extracted_D013-294641 \n", "6 extracted_D111__439772 \n", "7 extracted_D001-1614k \n", "8 extracted_D013-294641 \n", "9 extracted_D015 \n", "10 extracted_D021 \n", "11 extracted_D058-718205 \n", "12 extracted_D062 \n", "13 extracted_D111__439772 \n", "14 extracted_D133 \n", "15 extracted_D140-370312 \n", "16 extracted_D144-2023 \n", "17 extracted_D147-5736K \n", "18 extracted_D003-legacy 1738861 \n", "19 extracted_D003_4573K \n", "20 extracted_D010-428074 \n", "21 extracted_D011-582106 \n", "22 extracted_D012-53344cpds \n", "\n", "Saved SDF file list to: ../extracted_sdf_files/sdf_file_list.csv\n", "\n", "Largest SDF files:\n", " relative_path size_mb\n", "65 existing/part1-165w/D001 Aug-2025/D001-1550k/D... 1921.503638\n", "81 existing/part3-1400w/D065 Mar-2025/D065-646891... 1830.950122\n", "64 existing/part1-165w/D001 Aug-2025/D001-1550k/D... 1796.318578\n", "66 existing/part1-165w/D001 Aug-2025/D001-1550k/D... 1793.656826\n", "45 extracted_D003-legacy 1738861 /D003_legacy_2_6... 1699.557550\n", "50 extracted_D003_4573K/D003_legacy_2_600000.sdf 1699.557550\n", "60 extracted_D011-582106/D011-582106.sdf 1646.037828\n", "44 extracted_D003-legacy 1738861 /D003_legacy_1_6... 1604.531812\n", "49 extracted_D003_4573K/D003_legacy_1_600000.sdf 1604.531812\n", "71 existing/part2-845w/D006 Sep-2025/D006-1697617... 1603.250914\n" ] } ], "source": [ "# Create a comprehensive list of all SDF files\n", "all_sdf_files = list(output_dir.rglob('*.sdf')) + list(output_dir.rglob('*.mol')) + list(output_dir.rglob('*.sd'))\n", "\n", "print(f\"Total SDF files available: {len(all_sdf_files)}\")\n", "\n", "# Create summary statistics\n", "total_size = sum(f.stat().st_size for f in all_sdf_files)\n", "print(f\"Total size: {total_size / (1024**3):.2f} GB\")\n", "\n", "# Display extraction log\n", "extraction_df = pd.DataFrame(extraction_log)\n", "print(\"\\nExtraction Summary:\")\n", "print(extraction_df)\n", "\n", "# Save the list of all SDF files\n", "sdf_file_list = []\n", "for sdf_file in all_sdf_files:\n", " sdf_file_list.append({\n", " 'file_path': str(sdf_file),\n", " 'relative_path': str(sdf_file.relative_to(output_dir)),\n", " 'size_bytes': sdf_file.stat().st_size,\n", " 'size_mb': sdf_file.stat().st_size / (1024**2)\n", " })\n", "\n", "sdf_df = pd.DataFrame(sdf_file_list)\n", "sdf_df.to_csv(output_dir / 'sdf_file_list.csv', index=False)\n", "print(f\"\\nSaved SDF file list to: {output_dir / 'sdf_file_list.csv'}\")\n", "\n", "# Display largest files\n", "print(\"\\nLargest SDF files:\")\n", "print(sdf_df.nlargest(10, 'size_mb')[['relative_path', 'size_mb']])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Testing SDF file readability...\n", "✓ D003-4_l 460000.sdf: 460000 molecules\n", "✓ D003-5_l 460000.sdf: 460000 molecules\n", "✓ D003-6_l 460000.sdf: 460000 molecules\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[21:48:17] Explicit valence for atom # 20 B, 4, is greater than permitted\n", "[21:48:17] ERROR: Could not sanitize molecule ending on line 35871530\n", "[21:48:17] ERROR: Explicit valence for atom # 20 B, 4, is greater than permitted\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "✓ D003-7_l 360260.sdf: 360259 molecules\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[21:49:06] Explicit valence for atom # 0 N, 4, is greater than permitted\n", "[21:49:06] ERROR: Could not sanitize molecule ending on line 7897106\n", "[21:49:06] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n", "[21:49:11] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 12 ignored\n", "[21:49:11] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 8 ignored\n", "[21:49:12] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 12 ignored\n", "[21:49:12] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 14 ignored\n", "[21:49:13] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n", "[21:49:13] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n", "[21:49:14] Explicit valence for atom # 6 N, 4, is greater than permitted\n", "[21:49:14] ERROR: Could not sanitize molecule ending on line 11196260\n", "[21:49:14] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n", "[21:49:20] Explicit valence for atom # 5 N, 4, is greater than permitted\n", "[21:49:20] ERROR: Could not sanitize molecule ending on line 13936884\n", "[21:49:20] ERROR: Explicit valence for atom # 5 N, 4, is greater than permitted\n", "[21:49:23] Explicit valence for atom # 0 N, 4, is greater than permitted\n", "[21:49:23] ERROR: Could not sanitize molecule ending on line 15981200\n", "[21:49:23] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n", "[21:49:27] Explicit valence for atom # 6 O, 3, is greater than permitted\n", "[21:49:27] ERROR: Could not sanitize molecule ending on line 18013023\n", "[21:49:27] ERROR: Explicit valence for atom # 6 O, 3, is greater than permitted\n", "[21:49:37] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n", "[21:49:37] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n", "[21:49:47] Explicit valence for atom # 7 N, 4, is greater than permitted\n", "[21:49:47] ERROR: Could not sanitize molecule ending on line 30340115\n", "[21:49:47] ERROR: Explicit valence for atom # 7 N, 4, is greater than permitted\n", "[21:49:47] Explicit valence for atom # 5 N, 4, is greater than permitted\n", "[21:49:47] ERROR: Could not sanitize molecule ending on line 30341543\n", "[21:49:47] ERROR: Explicit valence for atom # 5 N, 4, is greater than permitted\n", "[21:49:47] Explicit valence for atom # 2 C, 5, is greater than permitted\n", "[21:49:47] ERROR: Could not sanitize molecule ending on line 30419985\n", "[21:49:47] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted\n", "[21:49:53] Explicit valence for atom # 0 N, 4, is greater than permitted\n", "[21:49:53] ERROR: Could not sanitize molecule ending on line 33667204\n", "[21:49:53] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n", "[21:49:54] Explicit valence for atom # 0 N, 4, is greater than permitted\n", "[21:49:54] ERROR: Could not sanitize molecule ending on line 34387487\n", "[21:49:54] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n", "[21:49:56] Explicit valence for atom # 3 N, 4, is greater than permitted\n", "[21:49:56] ERROR: Could not sanitize molecule ending on line 35795575\n", "[21:49:56] ERROR: Explicit valence for atom # 3 N, 4, is greater than permitted\n", "[21:49:57] Explicit valence for atom # 1 N, 4, is greater than permitted\n", "[21:49:57] ERROR: Could not sanitize molecule ending on line 36061075\n", "[21:49:57] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted\n", "[21:49:57] Explicit valence for atom # 6 N, 4, is greater than permitted\n", "[21:49:57] ERROR: Could not sanitize molecule ending on line 36061540\n", "[21:49:57] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n", "[21:49:57] Explicit valence for atom # 3 N, 4, is greater than permitted\n", "[21:49:57] ERROR: Could not sanitize molecule ending on line 36064097\n", "[21:49:57] ERROR: Explicit valence for atom # 3 N, 4, is greater than permitted\n", "[21:50:05] Explicit valence for atom # 6 N, 4, is greater than permitted\n", "[21:50:05] ERROR: Could not sanitize molecule ending on line 41484117\n", "[21:50:05] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n", "[21:50:06] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n", "[21:50:06] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n", "[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n", "[21:50:08] ERROR: Could not sanitize molecule ending on line 43160926\n", "[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n", "[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n", "[21:50:08] ERROR: Could not sanitize molecule ending on line 43161491\n", "[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n", "[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n", "[21:50:08] ERROR: Could not sanitize molecule ending on line 43357350\n", "[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "✓ D009_part1_474930.sdf: 474912 molecules\n", "\n", "Extraction complete! Ready for substructure matching.\n" ] } ], "source": [ "# Verify a few SDF files can be read\n", "from rdkit import Chem\n", "from rdkit.Chem import SDMolSupplier\n", "\n", "print(\"Testing SDF file readability...\")\n", "test_files = all_sdf_files[:5] # Test first 5 files\n", "\n", "for sdf_file in test_files:\n", " try:\n", " suppl = SDMolSupplier(str(sdf_file))\n", " mols = [mol for mol in suppl if mol is not None]\n", " print(f\"✓ {sdf_file.name}: {len(mols)} molecules\")\n", " except Exception as e:\n", " print(f\"✗ {sdf_file.name}: Error - {e}\")\n", "\n", "print(\"\\nExtraction complete! Ready for substructure matching.\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "default", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.14.0" } }, "nbformat": 4, "nbformat_minor": 4 }