first add

2025-09-17 22:14:05 +08:00
commit 64d80d2e3c
9 changed files with 1534 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
 test/
--- a/assets/ring16.smi
+++ b/assets/ring16.smi
@@ -0,0 +1 @@
 CC[C@H]1OC(=O)[*][*sugar*][C@H](C)[*sugar*][*]C[*]C(=O)[*][C@@H](O)[*]1
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -0,0 +1,40 @@
 # Base image: micromamba on Ubuntu 22.04 with CUDA 12.4.1 (per the image tag).
 FROM mambaorg/micromamba:cuda12.4.1-ubuntu22.04
 # Build-time password (overridable with --build-arg), also exported as an environment variable.
 # NOTE(review): despite its name the value is applied to `mambauser` below, not to root,
 # and a value stored via ENV presumably stays readable in the final image - confirm this is intended.
 ARG ROOT_PASSWD="micromamba"
 ENV ROOT_PASSWD=${ROOT_PASSWD}
 USER root
 # As root: install system packages, give mambauser passwordless sudo, and set mambauser's password.
 # NOTE(review): the heredoc script has no `set -e`, so an early failing command
 # may not abort the build - confirm.
 RUN <<EOT
 #!/bin/bash
 apt update
 apt install vim git sudo bash-completion wget curl htop jq bzip2 npm unzip libz-dev libgomp1 libblas-dev liblapack-dev libzmq3-dev -y
 echo "mambauser ALL=(ALL) NOPASSWD: ALL" | sudo tee -a /etc/sudoers > /dev/null
 echo "mambauser:${ROOT_PASSWD}" | chpasswd
 EOT
 USER mambauser
 # Build settings exported to later steps; presumably read by LillyMol's build scripts - confirm.
 ENV LILLYMOL_HOME=/home/mambauser/LillyMol \
    BUILD_DIR=Linux \
    BUILD_BDB=1 \
    BUILD_PYTHON=1
 # As mambauser: add conda/mamba aliases, create the `lillymol` environment (Python 3.10),
 # arrange for it to be activated from .bashrc, and clone the LillyMol sources.
 RUN <<EOT
 #!/bin/bash
 echo "alias conda=micromamba" >> /home/mambauser/.bashrc
 echo "alias mamba=micromamba" >> /home/mambauser/.bashrc
 micromamba create -n lillymol python=3.10 pybind11 absl-py protobuf pandas scipy -y
 echo "mamba activate lillymol" >> /home/mambauser/.bashrc
 git clone https://github.com/IanAWatson/LillyMol /home/mambauser/LillyMol
 EOT
 WORKDIR /home/mambauser/LillyMol
 # Install bazelisk through npm, build the third-party dependencies, update the Bazel
 # WORKSPACE with the Python location, then build everything with make.
 # NOTE(review): nothing in this step activates the `lillymol` environment, and the global npm
 # install runs as mambauser without sudo - verify that `python` and `bazelisk` resolve here.
 RUN <<EOT
 #!/bin/bash
 npm install -g @bazel/bazelisk
 cd src
 bash ./build_third_party.sh
 python update_python_in_workspace.py /home/mambauser/LillyMol/src/WORKSPACE
 bazelisk -h
 make all
 EOT
 # NOTE(review): the two lines below look like a pointer to a test assertion that needed
 # attention during the build - confirm what action, if any, is expected.
 # /home/mambauser/LillyMol/src/Molecule_Lib/linear_fingerprint_test.cc
 # line 373 EXPECT_EQ(_sfc.nbits(), 5l);
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -0,0 +1,14 @@
 # Compose definition for the LillyMol development container.
 version: '3'
 services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # NOTE(review): the Dockerfile in this directory declares only ROOT_PASSWD,
        # so this build argument is currently not consumed by the build.
        CUDA_VERSION: 12.4
    image: hotwa/lillymol:20250213
    container_name: lilly_mol
    volumes:
      # LillyMol lives in /home/mambauser/LillyMol inside the image and the utility
      # scripts read /home/mambauser/LillyMol/test/..., so mount the host data there.
      - ../test:/home/mambauser/LillyMol/test
    # Keep the container alive so it can be entered with `docker exec`.
    command: >
      bash -c "tail -f /dev/null"
--- a/utils/main.ipynb
+++ b/utils/main.ipynb
--- a/utils/ring_analysis.py
+++ b/utils/ring_analysis.py
@@ -0,0 +1,83 @@
 from rdkit import Chem
 from joblib import Parallel, delayed
 import logging
 from collections import Counter
 # Logging: INFO and above go to a file, one timestamped record per line
 logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename="rgroup_matching.log",
 )
 # SMARTS query: an atom in a 12- to 20-membered ring bonded to an oxygen
 # that is attached to a carbonyl carbon
 macro = Chem.MolFromSmarts("[r12,r13,r14,r15,r16,r17,r18,r19,r20]([#8][#6](=[#8]))")
 # Load the SMI file, keeping every non-blank line with surrounding whitespace removed
 smi_file = "/home/mambauser/LillyMol/test/1M_stratsampled_V1B.smi"
 with open(smi_file, 'r') as f:
    SMILES_list = list(filter(None, map(str.strip, f)))
 logging.info(f"Loaded {len(SMILES_list)} molecules from {smi_file}.")
 # SMARTS matching plus largest-ring measurement for a single SMILES
 def match_and_ring_analysis(smiles):
    """Match one SMILES against ``macro`` and measure its largest ring.

    Returns a ``(smiles, matches, largest_ring_size)`` tuple. ``matches`` is
    the result of ``GetSubstructMatches`` for the module-level ``macro``
    query, or ``None`` when the SMILES cannot be parsed. The ring size is
    the atom count of the biggest ring reported by the molecule's ring
    info; it is 0 for unparsable input and for molecules without rings.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        # Unparsable SMILES: no match data and no ring information
        return smiles, None, 0
    hits = parsed.GetSubstructMatches(macro)
    ring_lengths = [len(ring) for ring in parsed.GetRingInfo().AtomRings()]
    biggest = max(ring_lengths) if ring_lengths else 0
    return smiles, hits, biggest
 # Fan the per-molecule work out over all available cores with joblib
 logging.info("Starting SMARTS matching and ring analysis...")
 results = Parallel(n_jobs=-1)(delayed(match_and_ring_analysis)(s) for s in SMILES_list)
 # Split molecules into matched / unmatched (unparsable SMILES count as unmatched)
 # and collect each molecule's largest ring size in the same pass
 success, fail, ring_sizes = [], [], []
 for smiles, result, largest_ring_size in results:
    (success if result else fail).append(smiles)
    ring_sizes.append(largest_ring_size)
 # Frequency of each largest-ring size
 ring_size_counter = Counter(ring_sizes)
 # Headline numbers
 total = len(success) + len(fail)
 if total > 0:
    success_rate = len(success) / total * 100
 else:
    success_rate = 0
 summary_lines = [
    f"Total molecules: {total}",
    f"Success: {len(success)}",
    f"Fail: {len(fail)}",
    f"Success rate: {success_rate:.2f}%",
 ]
 sorted_ring_sizes = sorted(ring_size_counter.items())
 # Record the summary in the log first, then echo it to stdout
 for summary_line in summary_lines:
    logging.info(summary_line)
 logging.info(f"Ring size distribution: {dict(ring_size_counter)}")
 for summary_line in summary_lines:
    print(summary_line)
 print("Ring size distribution:")
 for size, count in sorted_ring_sizes:
    print(f"  Ring size {size}: {count}")
 # Dump the unmatched molecules to an SMI file, one SMILES per line
 fail_smi_file = "fail_molecules.smi"
 with open(fail_smi_file, "w") as ff:
    ff.writelines(smiles + "\n" for smiles in fail)
 logging.info(f"Failed molecules written to {fail_smi_file}.")
 print(f"Failed molecules written to {fail_smi_file}.")
 # Dump the ring-size distribution as a tab-separated table
 ring_size_file = "ring_size_distribution.txt"
 with open(ring_size_file, "w") as rf:
    rf.write("Ring Size\tCount\n")
    rf.writelines(f"{size}\t{count}\n" for size, count in sorted_ring_sizes)
 logging.info(f"Ring size distribution written to {ring_size_file}.")
 print(f"Ring size distribution written to {ring_size_file}.")
--- a/utils/search.ipynb
+++ b/utils/search.ipynb
@@ -0,0 +1,182 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34ca5db2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/root/micromamba/envs/qsar/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ChEMBL 数据库存储路径: /srv/project/LillyMol/test/data/35/data/chembl_35/chembl_35_sqlite/chembl_35.db\n"
     ]
    }
   ],
   "source": [
    "import chembl_downloader\n",
    "# 指定存储目录\n",
    "prefix = '/srv/project/LillyMol/test/data'\n",
    "version = chembl_downloader.latest() # version 35\n",
    "\n",
    "# 下载并提取 ChEMBL 数据库，存放到指定目录\n",
    "path = chembl_downloader.download_extract_sqlite(version=version, prefix=[prefix])\n",
    "# chembl_downloader.download_sqlite(version)\n",
    "\n",
    "print(f\"ChEMBL 数据库存储路径: {path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dd6f3b25",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(0, 'molregno', 'BIGINT', 1, None, 0)\n",
      "(1, 'syn_type', 'VARCHAR(50)', 1, None, 0)\n",
      "(2, 'molsyn_id', 'BIGINT', 1, None, 1)\n",
      "(3, 'res_stem_id', 'BIGINT', 0, None, 0)\n",
      "(4, 'synonyms', 'VARCHAR(250)', 0, None, 0)\n"
     ]
    }
   ],
   "source": [
    "import chembl_downloader\n",
    "\n",
    "with chembl_downloader.connect(version='35', prefix=['/srv/project/LillyMol/test/data']) as conn:\n",
    "    cursor = conn.cursor()\n",
    "    cursor.execute(\"PRAGMA table_info(molecule_synonyms);\")\n",
    "    columns = cursor.fetchall()\n",
    "    for col in columns:\n",
    "        print(col)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c8390d91",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>molregno</th>\n",
       "      <th>syn_type</th>\n",
       "      <th>molsyn_id</th>\n",
       "      <th>res_stem_id</th>\n",
       "      <th>synonyms</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>27340</td>\n",
       "      <td>TRADE_NAME</td>\n",
       "      <td>3001861</td>\n",
       "      <td>None</td>\n",
       "      <td>Brolene antibiotic</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>27340</td>\n",
       "      <td>TRADE_NAME</td>\n",
       "      <td>3001879</td>\n",
       "      <td>None</td>\n",
       "      <td>Golden eye antibiotic</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   molregno    syn_type  molsyn_id res_stem_id               synonyms\n",
       "0     27340  TRADE_NAME    3001861        None     Brolene antibiotic\n",
       "1     27340  TRADE_NAME    3001879        None  Golden eye antibiotic"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import chembl_downloader\n",
    "\n",
    "sql = \"\"\"\n",
    "SELECT *\n",
    "FROM molecule_synonyms\n",
    "WHERE syn_type='TRADE_NAME' AND synonyms LIKE '%antibiotic%'\n",
    "\"\"\"\n",
    "\n",
    "\n",
    "# version 一般也建议加上，保证和你本地一致\n",
    "df = chembl_downloader.query(\n",
    "    sql,\n",
    "    version='35',   # 或者 version=version\n",
    "    prefix=['/srv/project/LillyMol/test/data']\n",
    ")\n",
    "df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0c56718",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "qsar",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/utils/search.py
+++ b/utils/search.py
@@ -0,0 +1,17 @@
 import sqlite3
 from contextlib import closing

 # Location of the local ChEMBL 35 SQLite dump, relative to the working directory
 DB_PATH = '../test/data/35/data/chembl_35/chembl_35_sqlite/chembl_35.db'

 # Antibiotic-related molecules. The synonym text is stored in
 # molecule_synonyms.synonyms; that table has no molecule_synonym column,
 # so the previous query failed with "no such column".
 query = """
 SELECT m.chembl_id, m.pref_name, cs.synonyms
 FROM molecule_synonyms cs
 JOIN molecule_dictionary m ON cs.molregno = m.molregno
 WHERE cs.synonyms LIKE '%antibiotic%'
 LIMIT 10
 """


 def find_antibiotic_synonyms(conn):
    """Return up to 10 ``(chembl_id, pref_name, synonym)`` rows.

    ``conn`` is an open ``sqlite3`` connection to a ChEMBL database. A row is
    returned for every synonym containing "antibiotic", joined to its
    molecule through ``molregno``. The cursor is closed before returning;
    the connection is left open for the caller.
    """
    with closing(conn.execute(query)) as cursor:
        return cursor.fetchall()


 def main():
    """Open the ChEMBL database, print the matching rows, and close it."""
    # closing() guarantees the connection is released even if the query fails
    with closing(sqlite3.connect(DB_PATH)) as conn:
        for row in find_antibiotic_synonyms(conn):
            print(row)


 if __name__ == "__main__":
    main()
--- a/utils/test_smart.py
+++ b/utils/test_smart.py
@@ -0,0 +1,60 @@
 from rdkit import Chem
 from joblib import Parallel, delayed
 import logging
 # Logging: INFO and above are appended to a file with timestamped records
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="rgroup_matching.log",
 )
 # SMARTS query: an atom in a 12- to 20-membered ring bonded to an oxygen
 # that is attached to a carbonyl carbon
 macro = Chem.MolFromSmarts("[r12,r13,r14,r15,r16,r17,r18,r19,r20]([#8][#6](=[#8]))")
 # Read the SMI file: strip each line and drop the ones that end up empty
 smi_file = "/home/mambauser/LillyMol/test/1M_stratsampled_V1B.smi"
 with open(smi_file, 'r') as f:
    SMILES_list = [entry for entry in map(str.strip, f) if entry]
 logging.info(f"Loaded {len(SMILES_list)} molecules from {smi_file}.")
 # Matching function applied to each SMILES
 def match_smarts(smiles):
    """Match one SMILES against the module-level ``macro`` query.

    Returns ``(smiles, matches)`` where ``matches`` is the result of
    ``GetSubstructMatches``, or ``None`` when the SMILES cannot be parsed.
    """
    parsed = Chem.MolFromSmiles(smiles)
    # An unparsable SMILES yields None in place of the match tuple
    hits = None if parsed is None else parsed.GetSubstructMatches(macro)
    return smiles, hits
 # Run the matcher over every SMILES in parallel with joblib, using all cores
 logging.info("Starting SMARTS matching...")
 results = Parallel(n_jobs=-1)(delayed(match_smarts)(s) for s in SMILES_list)
 # Partition into matched and unmatched molecules (unparsable SMILES are unmatched)
 success, fail = [], []
 for smiles, result in results:
    if result:
        success.append(smiles)
    else:
        fail.append(smiles)
 # Summary statistics
 total = len(success) + len(fail)
 if total > 0:
    success_rate = len(success) / total * 100
 else:
    success_rate = 0
 report = (
    f"Total molecules: {total}",
    f"Success: {len(success)}",
    f"Fail: {len(fail)}",
    f"Success rate: {success_rate:.2f}%",
 )
 # Write the summary to the log first, then repeat it on stdout
 for entry in report:
    logging.info(entry)
 for entry in report:
    print(entry)
 # Save the unmatched molecules as an SMI file, one SMILES per line
 fail_smi_file = "fail_molecules.smi"
 with open(fail_smi_file, "w") as ff:
    ff.writelines(smiles + "\n" for smiles in fail)
 logging.info(f"Failed molecules written to {fail_smi_file}.")
 print(f"Failed molecules written to {fail_smi_file}.")
		`@@ -0,0 +1 @@`
							`CC[C@H]1OC(=O)[*][*sugar*][C@H](C)[*sugar*][*]C[*]C(=O)[*][C@@H](O)[*]1`