search_macro/notebooks/01_extract_sdf_files.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract SDF Files from ZIP Archives\n",
    "\n",
    "This notebook extracts all SDF files from ZIP archives and collects existing SDF files into a unified directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data directory: ../data\n",
      "Output directory: ../extracted_sdf_files\n"
     ]
    }
   ],
   "source": [
    "import zipfile\n",
    "import rarfile\n",
    "import tarfile\n",
    "import gzip\n",
    "import shutil\n",
    "from pathlib import Path\n",
    "import os\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "\n",
    "# Set up paths\n",
    "data_dir = Path('../data')\n",
    "output_dir = Path('../extracted_sdf_files')\n",
    "output_dir.mkdir(exist_ok=True)\n",
    "\n",
    "print(f\"Data directory: {data_dir}\")\n",
    "print(f\"Output directory: {output_dir}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Searching for compressed files and SDF files...\n",
      "Found 23 compressed files\n",
      "Found 29 existing SDF files\n",
      "\n",
      "Compressed files:\n",
      "  1. V1ePm5EdwFOnQFwMpOFJk.tar.gz (1.68 GB)\n",
      "  2. chemial_software.rar (0.05 GB)\n",
      "  3. 高性价比数据/Legancy 1740260 .zip (0.46 GB)\n",
      "  4. 高性价比数据/D009 Dec-2024/D009-1396k.zip (0.51 GB)\n",
      "  5. 高性价比数据/D011 Feb-2025/D011-576k.zip (0.21 GB)\n",
      "  6. 高性价比数据/D013 Feb-2025/D013-294641.zip (0.08 GB)\n",
      "  7. 高性价比数据/D111 Mar-2025/D111__439772.zip (0.18 GB)\n",
      "  8. 高性价比数据/1-165万种核心数据库/D001 Feb-2025/D001-1614k.zip (0.64 GB)\n",
      "  9. part3-1400w/D013 Feb-2025/D013-294641.zip (0.08 GB)\n",
      "  10. part3-1400w/D015 jan-2023/D015.zip (0.31 GB)\n",
      "  ... and 13 more files\n"
     ]
    }
   ],
   "source": [
    "# Find all compressed files and existing SDF files\n",
    "compressed_files = []\n",
    "existing_sdf_files = []\n",
    "\n",
    "# Define file extensions for compressed files\n",
    "compressed_extensions = {'.zip', '.rar', '.tar.gz', '.tgz', '.gz'}\n",
    "sdf_extensions = {'.sdf', '.mol', '.sd'}\n",
    "\n",
    "print(\"Searching for compressed files and SDF files...\")\n",
    "for file_path in data_dir.rglob('*'):\n",
    "    if file_path.is_file():\n",
    "        if file_path.suffix.lower() in compressed_extensions or \\\n",
    "           ''.join(file_path.suffixes).lower() in {'.tar.gz', '.tgz'}:\n",
    "            compressed_files.append(file_path)\n",
    "        elif file_path.suffix.lower() in sdf_extensions:\n",
    "            existing_sdf_files.append(file_path)\n",
    "\n",
    "print(f\"Found {len(compressed_files)} compressed files\")\n",
    "print(f\"Found {len(existing_sdf_files)} existing SDF files\")\n",
    "\n",
    "# Display compressed files\n",
    "print(\"\\nCompressed files:\")\n",
    "for i, file in enumerate(compressed_files[:10]):  # Show first 10\n",
    "    print(f\"  {i+1}. {file.relative_to(data_dir)} ({file.stat().st_size / (1024**3):.2f} GB)\")\n",
    "if len(compressed_files) > 10:\n",
    "    print(f\"  ... and {len(compressed_files) - 10} more files\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_sdf_from_archive(archive_path, extract_to):\n",
    "    \"\"\"Extract SDF files from various archive formats.\"\"\"\n",
    "    extracted_files = []\n",
    "    \n",
    "    try:\n",
    "        if archive_path.suffix.lower() == '.zip':\n",
    "            with zipfile.ZipFile(archive_path, 'r') as zip_ref:\n",
    "                for file_info in zip_ref.filelist:\n",
    "                    if file_info.filename.lower().endswith(('.sdf', '.mol', '.sd')):\n",
    "                        # Extract with full path structure preserved\n",
    "                        extracted_path = zip_ref.extract(file_info, extract_to)\n",
    "                        extracted_files.append(Path(extracted_path))\n",
    "        \n",
    "        elif archive_path.suffix.lower() == '.rar':\n",
    "            with rarfile.RarFile(archive_path, 'r') as rar_ref:\n",
    "                for file_info in rar_ref.infolist():\n",
    "                    if file_info.filename.lower().endswith(('.sdf', '.mol', '.sd')):\n",
    "                        extracted_path = rar_ref.extract(file_info, extract_to)\n",
    "                        extracted_files.append(Path(extracted_path))\n",
    "        \n",
    "        elif archive_path.suffix.lower() in {'.gz', '.tgz'} or ''.join(archive_path.suffixes).lower() in {'.tar.gz', '.tgz'}:\n",
    "            if ''.join(archive_path.suffixes).lower() in {'.tar.gz', '.tgz'}:\n",
    "                with tarfile.open(archive_path, 'r:gz') as tar_ref:\n",
    "                    for member in tar_ref.getmembers():\n",
    "                        if member.name.lower().endswith(('.sdf', '.mol', '.sd')):\n",
    "                            extracted_path = tar_ref.extract(member, extract_to)\n",
    "                            extracted_files.append(Path(extracted_path))\n",
    "            else:\n",
    "                # Single gzip file\n",
    "                output_path = extract_to / archive_path.stem\n",
    "                if output_path.suffix.lower() in {'.sdf', '.mol', '.sd'}:\n",
    "                    with gzip.open(archive_path, 'rb') as gz_file:\n",
    "                        with open(output_path, 'wb') as out_file:\n",
    "                            shutil.copyfileobj(gz_file, out_file)\n",
    "                    extracted_files.append(output_path)\n",
    "    \n",
    "    except Exception as e:\n",
    "        print(f\"Error extracting {archive_path}: {e}\")\n",
    "    \n",
    "    return extracted_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracting SDF files from compressed archives...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Extracting archives:   4%|▍         | 1/23 [00:03<01:22,  3.76s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error extracting ../data/V1ePm5EdwFOnQFwMpOFJk.tar.gz: Compressed file ended before the end-of-stream marker was reached\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Extracting archives:  61%|██████    | 14/23 [01:51<01:53, 12.64s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error extracting ../data/part3-1400w/D133 may-2023/D133.rar: Cannot find working tool\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Extracting archives: 100%|██████████| 23/23 [04:47<00:00, 12.49s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Error extracting ../data/part2-845w/D012 may-2023/D012-53344cpds.rar: Cannot find working tool\n",
      "\n",
      "Extracted 85 SDF files from archives\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Extract SDF files from all compressed archives\n",
    "extracted_sdf_files = []\n",
    "extraction_log = []\n",
    "\n",
    "print(\"Extracting SDF files from compressed archives...\")\n",
    "for archive_path in tqdm(compressed_files, desc=\"Extracting archives\"):\n",
    "    # Create a subdirectory for each archive to maintain organization\n",
    "    archive_extract_dir = output_dir / f\"extracted_{archive_path.stem}\"\n",
    "    archive_extract_dir.mkdir(exist_ok=True)\n",
    "    \n",
    "    extracted = extract_sdf_from_archive(archive_path, archive_extract_dir)\n",
    "    extracted_sdf_files.extend(extracted)\n",
    "    \n",
    "    extraction_log.append({\n",
    "        'archive': str(archive_path.relative_to(data_dir)),\n",
    "        'extracted_files': len(extracted),\n",
    "        'extract_dir': str(archive_extract_dir.relative_to(output_dir))\n",
    "    })\n",
    "\n",
    "print(f\"\\nExtracted {len(extracted_sdf_files)} SDF files from archives\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Copying existing SDF files...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Copying SDF files: 100%|██████████| 29/29 [01:50<00:00,  3.83s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Copied 29 existing SDF files\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Copy existing SDF files to the output directory\n",
    "copied_sdf_files = []\n",
    "\n",
    "print(\"Copying existing SDF files...\")\n",
    "for sdf_path in tqdm(existing_sdf_files, desc=\"Copying SDF files\"):\n",
    "    # Create a subdirectory to maintain original path structure\n",
    "    relative_path = sdf_path.relative_to(data_dir)\n",
    "    destination = output_dir / \"existing\" / relative_path\n",
    "    destination.parent.mkdir(parents=True, exist_ok=True)\n",
    "    \n",
    "    try:\n",
    "        shutil.copy2(sdf_path, destination)\n",
    "        copied_sdf_files.append(destination)\n",
    "    except Exception as e:\n",
    "        print(f\"Error copying {sdf_path}: {e}\")\n",
    "\n",
    "print(f\"Copied {len(copied_sdf_files)} existing SDF files\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total SDF files available: 110\n",
      "Total size: 90.10 GB\n",
      "\n",
      "Extraction Summary:\n",
      "                                              archive  extracted_files  \\\n",
      "0                        V1ePm5EdwFOnQFwMpOFJk.tar.gz                0   \n",
      "1                                chemial_software.rar                0   \n",
      "2                         高性价比数据/Legancy 1740260 .zip                4   \n",
      "3                 高性价比数据/D009 Dec-2024/D009-1396k.zip                3   \n",
      "4                  高性价比数据/D011 Feb-2025/D011-576k.zip                2   \n",
      "5                高性价比数据/D013 Feb-2025/D013-294641.zip                1   \n",
      "6               高性价比数据/D111 Mar-2025/D111__439772.zip                3   \n",
      "7    高性价比数据/1-165万种核心数据库/D001 Feb-2025/D001-1614k.zip                4   \n",
      "8           part3-1400w/D013 Feb-2025/D013-294641.zip                1   \n",
      "9                  part3-1400w/D015 jan-2023/D015.zip                6   \n",
      "10                 part3-1400w/D021 apr-2022/D021.zip                8   \n",
      "11          part3-1400w/D058 Mar-2025/D058-718205.zip                3   \n",
      "12                part3-1400w/D062 june-2024/D062.zip                4   \n",
      "13         part3-1400w/D111 Mar-2025/D111__439772.zip                3   \n",
      "14                 part3-1400w/D133 may-2023/D133.rar                0   \n",
      "15          part3-1400w/D140 Mar-2025/D140-370312.zip                2   \n",
      "16            part3-1400w/D144 Oct-2023/D144-2023.zip               12   \n",
      "17           part3-1400w/D147 May-2024/D147-5736K.zip               12   \n",
      "18  part2-845w/D003 May-2025/D003-legacy 1738861 .zip                3   \n",
      "19            part2-845w/D003 May-2025/D003_4573K.zip               10   \n",
      "20           part2-845w/D010 Feb-2025/D010-428074.zip                3   \n",
      "21         part2-845w/D011 April-2025/D011-582106.zip                1   \n",
      "22        part2-845w/D012 may-2023/D012-53344cpds.rar                0   \n",
      "\n",
      "                            extract_dir  \n",
      "0   extracted_V1ePm5EdwFOnQFwMpOFJk.tar  \n",
      "1            extracted_chemial_software  \n",
      "2            extracted_Legancy 1740260   \n",
      "3                  extracted_D009-1396k  \n",
      "4                   extracted_D011-576k  \n",
      "5                 extracted_D013-294641  \n",
      "6                extracted_D111__439772  \n",
      "7                  extracted_D001-1614k  \n",
      "8                 extracted_D013-294641  \n",
      "9                        extracted_D015  \n",
      "10                       extracted_D021  \n",
      "11                extracted_D058-718205  \n",
      "12                       extracted_D062  \n",
      "13               extracted_D111__439772  \n",
      "14                       extracted_D133  \n",
      "15                extracted_D140-370312  \n",
      "16                  extracted_D144-2023  \n",
      "17                 extracted_D147-5736K  \n",
      "18       extracted_D003-legacy 1738861   \n",
      "19                 extracted_D003_4573K  \n",
      "20                extracted_D010-428074  \n",
      "21                extracted_D011-582106  \n",
      "22             extracted_D012-53344cpds  \n",
      "\n",
      "Saved SDF file list to: ../extracted_sdf_files/sdf_file_list.csv\n",
      "\n",
      "Largest SDF files:\n",
      "                                        relative_path      size_mb\n",
      "65  existing/part1-165w/D001 Aug-2025/D001-1550k/D...  1921.503638\n",
      "81  existing/part3-1400w/D065 Mar-2025/D065-646891...  1830.950122\n",
      "64  existing/part1-165w/D001 Aug-2025/D001-1550k/D...  1796.318578\n",
      "66  existing/part1-165w/D001 Aug-2025/D001-1550k/D...  1793.656826\n",
      "45  extracted_D003-legacy 1738861 /D003_legacy_2_6...  1699.557550\n",
      "50      extracted_D003_4573K/D003_legacy_2_600000.sdf  1699.557550\n",
      "60              extracted_D011-582106/D011-582106.sdf  1646.037828\n",
      "44  extracted_D003-legacy 1738861 /D003_legacy_1_6...  1604.531812\n",
      "49      extracted_D003_4573K/D003_legacy_1_600000.sdf  1604.531812\n",
      "71  existing/part2-845w/D006 Sep-2025/D006-1697617...  1603.250914\n"
     ]
    }
   ],
   "source": [
    "# Create a comprehensive list of all SDF files\n",
    "all_sdf_files = list(output_dir.rglob('*.sdf')) + list(output_dir.rglob('*.mol')) + list(output_dir.rglob('*.sd'))\n",
    "\n",
    "print(f\"Total SDF files available: {len(all_sdf_files)}\")\n",
    "\n",
    "# Create summary statistics\n",
    "total_size = sum(f.stat().st_size for f in all_sdf_files)\n",
    "print(f\"Total size: {total_size / (1024**3):.2f} GB\")\n",
    "\n",
    "# Display extraction log\n",
    "extraction_df = pd.DataFrame(extraction_log)\n",
    "print(\"\\nExtraction Summary:\")\n",
    "print(extraction_df)\n",
    "\n",
    "# Save the list of all SDF files\n",
    "sdf_file_list = []\n",
    "for sdf_file in all_sdf_files:\n",
    "    sdf_file_list.append({\n",
    "        'file_path': str(sdf_file),\n",
    "        'relative_path': str(sdf_file.relative_to(output_dir)),\n",
    "        'size_bytes': sdf_file.stat().st_size,\n",
    "        'size_mb': sdf_file.stat().st_size / (1024**2)\n",
    "    })\n",
    "\n",
    "sdf_df = pd.DataFrame(sdf_file_list)\n",
    "sdf_df.to_csv(output_dir / 'sdf_file_list.csv', index=False)\n",
    "print(f\"\\nSaved SDF file list to: {output_dir / 'sdf_file_list.csv'}\")\n",
    "\n",
    "# Display largest files\n",
    "print(\"\\nLargest SDF files:\")\n",
    "print(sdf_df.nlargest(10, 'size_mb')[['relative_path', 'size_mb']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing SDF file readability...\n",
      "✓ D003-4_l 460000.sdf: 460000 molecules\n",
      "✓ D003-5_l 460000.sdf: 460000 molecules\n",
      "✓ D003-6_l 460000.sdf: 460000 molecules\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[21:48:17] Explicit valence for atom # 20 B, 4, is greater than permitted\n",
      "[21:48:17] ERROR: Could not sanitize molecule ending on line 35871530\n",
      "[21:48:17] ERROR: Explicit valence for atom # 20 B, 4, is greater than permitted\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✓ D003-7_l 360260.sdf: 360259 molecules\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[21:49:06] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
      "[21:49:06] ERROR: Could not sanitize molecule ending on line 7897106\n",
      "[21:49:06] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
      "[21:49:11] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 12 ignored\n",
      "[21:49:11] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 8 ignored\n",
      "[21:49:12] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 12 ignored\n",
      "[21:49:12] Warning: ambiguous stereochemistry - zero final chiral volume - at atom 14 ignored\n",
      "[21:49:13] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
      "[21:49:13] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
      "[21:49:14] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
      "[21:49:14] ERROR: Could not sanitize molecule ending on line 11196260\n",
      "[21:49:14] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n",
      "[21:49:20] Explicit valence for atom # 5 N, 4, is greater than permitted\n",
      "[21:49:20] ERROR: Could not sanitize molecule ending on line 13936884\n",
      "[21:49:20] ERROR: Explicit valence for atom # 5 N, 4, is greater than permitted\n",
      "[21:49:23] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
      "[21:49:23] ERROR: Could not sanitize molecule ending on line 15981200\n",
      "[21:49:23] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
      "[21:49:27] Explicit valence for atom # 6 O, 3, is greater than permitted\n",
      "[21:49:27] ERROR: Could not sanitize molecule ending on line 18013023\n",
      "[21:49:27] ERROR: Explicit valence for atom # 6 O, 3, is greater than permitted\n",
      "[21:49:37] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
      "[21:49:37] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
      "[21:49:47] Explicit valence for atom # 7 N, 4, is greater than permitted\n",
      "[21:49:47] ERROR: Could not sanitize molecule ending on line 30340115\n",
      "[21:49:47] ERROR: Explicit valence for atom # 7 N, 4, is greater than permitted\n",
      "[21:49:47] Explicit valence for atom # 5 N, 4, is greater than permitted\n",
      "[21:49:47] ERROR: Could not sanitize molecule ending on line 30341543\n",
      "[21:49:47] ERROR: Explicit valence for atom # 5 N, 4, is greater than permitted\n",
      "[21:49:47] Explicit valence for atom # 2 C, 5, is greater than permitted\n",
      "[21:49:47] ERROR: Could not sanitize molecule ending on line 30419985\n",
      "[21:49:47] ERROR: Explicit valence for atom # 2 C, 5, is greater than permitted\n",
      "[21:49:53] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
      "[21:49:53] ERROR: Could not sanitize molecule ending on line 33667204\n",
      "[21:49:53] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
      "[21:49:54] Explicit valence for atom # 0 N, 4, is greater than permitted\n",
      "[21:49:54] ERROR: Could not sanitize molecule ending on line 34387487\n",
      "[21:49:54] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted\n",
      "[21:49:56] Explicit valence for atom # 3 N, 4, is greater than permitted\n",
      "[21:49:56] ERROR: Could not sanitize molecule ending on line 35795575\n",
      "[21:49:56] ERROR: Explicit valence for atom # 3 N, 4, is greater than permitted\n",
      "[21:49:57] Explicit valence for atom # 1 N, 4, is greater than permitted\n",
      "[21:49:57] ERROR: Could not sanitize molecule ending on line 36061075\n",
      "[21:49:57] ERROR: Explicit valence for atom # 1 N, 4, is greater than permitted\n",
      "[21:49:57] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
      "[21:49:57] ERROR: Could not sanitize molecule ending on line 36061540\n",
      "[21:49:57] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n",
      "[21:49:57] Explicit valence for atom # 3 N, 4, is greater than permitted\n",
      "[21:49:57] ERROR: Could not sanitize molecule ending on line 36064097\n",
      "[21:49:57] ERROR: Explicit valence for atom # 3 N, 4, is greater than permitted\n",
      "[21:50:05] Explicit valence for atom # 6 N, 4, is greater than permitted\n",
      "[21:50:05] ERROR: Could not sanitize molecule ending on line 41484117\n",
      "[21:50:05] ERROR: Explicit valence for atom # 6 N, 4, is greater than permitted\n",
      "[21:50:06] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
      "[21:50:06] Warning: ambiguous stereochemistry - opposing bonds have opposite wedging - at atom 1 ignored.\n",
      "[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n",
      "[21:50:08] ERROR: Could not sanitize molecule ending on line 43160926\n",
      "[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n",
      "[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n",
      "[21:50:08] ERROR: Could not sanitize molecule ending on line 43161491\n",
      "[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n",
      "[21:50:08] Explicit valence for atom # 2 N, 4, is greater than permitted\n",
      "[21:50:08] ERROR: Could not sanitize molecule ending on line 43357350\n",
      "[21:50:08] ERROR: Explicit valence for atom # 2 N, 4, is greater than permitted\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✓ D009_part1_474930.sdf: 474912 molecules\n",
      "\n",
      "Extraction complete! Ready for substructure matching.\n"
     ]
    }
   ],
   "source": [
    "# Verify a few SDF files can be read\n",
    "from rdkit import Chem\n",
    "from rdkit.Chem import SDMolSupplier\n",
    "\n",
    "print(\"Testing SDF file readability...\")\n",
    "test_files = all_sdf_files[:5]  # Test first 5 files\n",
    "\n",
    "for sdf_file in test_files:\n",
    "    try:\n",
    "        suppl = SDMolSupplier(str(sdf_file))\n",
    "        mols = [mol for mol in suppl if mol is not None]\n",
    "        print(f\"✓ {sdf_file.name}: {len(mols)} molecules\")\n",
    "    except Exception as e:\n",
    "        print(f\"✗ {sdf_file.name}: Error - {e}\")\n",
    "\n",
    "print(\"\\nExtraction complete! Ready for substructure matching.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "default",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.14.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}