first add

2025-09-17 22:14:05 +08:00
commit 64d80d2e3c
9 changed files with 1534 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+test/
--- a/assets/ring16.smi
+++ b/assets/ring16.smi
@@ -0,0 +1 @@
+CC[C@H]1OC(=O)[*][*sugar*][C@H](C)[*sugar*][*]C[*]C(=O)[*][C@@H](O)[*]1
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -0,0 +1,40 @@
+FROM mambaorg/micromamba:cuda12.4.1-ubuntu22.04
+ARG ROOT_PASSWD="micromamba"
+ENV ROOT_PASSWD=${ROOT_PASSWD}
+
+USER root
+RUN <<EOT
+#!/bin/bash
+apt update
+apt install vim git sudo bash-completion wget curl htop jq bzip2 npm unzip libz-dev libgomp1 libblas-dev liblapack-dev libzmq3-dev -y
+echo "mambauser ALL=(ALL) NOPASSWD: ALL" | sudo tee -a /etc/sudoers > /dev/null  # grant mambauser passwordless sudo
+echo "mambauser:${ROOT_PASSWD}" | chpasswd  # despite its name, ROOT_PASSWD becomes mambauser's password
+EOT
+
+USER mambauser
+ENV LILLYMOL_HOME=/home/mambauser/LillyMol \
+    BUILD_DIR=Linux \
+    BUILD_BDB=1 \
+    BUILD_PYTHON=1
+RUN <<EOT
+#!/bin/bash
+echo "alias conda=micromamba" >> /home/mambauser/.bashrc  # conda/mamba aliases are written to .bashrc
+echo "alias mamba=micromamba" >> /home/mambauser/.bashrc
+micromamba create -n lillymol python=3.10 pybind11 absl-py protobuf pandas scipy -y
+echo "mamba activate lillymol" >> /home/mambauser/.bashrc  # relies on the mamba alias appended above
+git clone https://github.com/IanAWatson/LillyMol /home/mambauser/LillyMol
+EOT
+
+WORKDIR /home/mambauser/LillyMol
+RUN <<EOT
+#!/bin/bash
+npm install -g @bazel/bazelisk
+cd src
+bash ./build_third_party.sh
+python update_python_in_workspace.py /home/mambauser/LillyMol/src/WORKSPACE
+bazelisk -h  # NOTE(review): presumably a sanity check that bazelisk is installed - confirm
+make all
+EOT
+
+# NOTE(review): unexplained reference, presumably a test assertion to revisit after the build - confirm:
+# /home/mambauser/LillyMol/src/Molecule_Lib/linear_fingerprint_test.cc line 373: EXPECT_EQ(_sfc.nbits(), 5l);
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -0,0 +1,14 @@
+version: '3'
+services:
+  app:  # the command below only idles (tail -f /dev/null), keeping the container up
+    build: 
+      context: .
+      dockerfile: Dockerfile
+      args: 
+        CUDA_VERSION: 12.4  # NOTE(review): the Dockerfile in this commit declares only ARG ROOT_PASSWD - confirm this arg is consumed
+    image: hotwa/lillymol:20250213
+    container_name: lilly_mol
+    volumes:
+      - ../test:/LillyMol/test  # NOTE(review): utils scripts read /home/mambauser/LillyMol/test - confirm the mount target
+    command: >
+      bash -c "tail -f /dev/null"
--- a/utils/main.ipynb
+++ b/utils/main.ipynb
--- a/utils/ring_analysis.py
+++ b/utils/ring_analysis.py
@@ -0,0 +1,83 @@
+from rdkit import Chem
+from joblib import Parallel, delayed
+import logging
+from collections import Counter
+
+# 定义日志配置
+logging.basicConfig(
+    filename="rgroup_matching.log",
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+
+# 定义 SMARTS 模式
+macro = Chem.MolFromSmarts("[r12,r13,r14,r15,r16,r17,r18,r19,r20]([#8][#6](=[#8]))")
+
+# 读取 SMI 文件
+smi_file = "/home/mambauser/LillyMol/test/1M_stratsampled_V1B.smi"
+with open(smi_file, 'r') as f:
+    SMILES_list = [line.strip() for line in f if line.strip()]
+
+logging.info(f"Loaded {len(SMILES_list)} molecules from {smi_file}.")
+
+# 匹配和最大环统计函数
+def match_and_ring_analysis(smiles):
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        return smiles, None, 0  # 无效分子
+    
+    result = mol.GetSubstructMatches(macro)
+    ri = mol.GetRingInfo()
+    largest_ring_size = max((len(r) for r in ri.AtomRings()), default=0)
+    
+    return smiles, result, largest_ring_size
+
+# 使用 joblib 并行处理
+logging.info("Starting SMARTS matching and ring analysis...")
+results = Parallel(n_jobs=-1)(delayed(match_and_ring_analysis)(s) for s in SMILES_list)
+
+# 分离成功、失败的分子以及最大环大小
+success = [smiles for smiles, result, _ in results if result]
+fail = [smiles for smiles, result, _ in results if not result]
+ring_sizes = [largest_ring_size for _, _, largest_ring_size in results]
+
+# 统计最大环频数
+ring_size_counter = Counter(ring_sizes)
+
+# 统计结果
+total = len(success) + len(fail)
+success_rate = len(success) / total * 100 if total > 0 else 0
+
+# 保存日志信息
+logging.info(f"Total molecules: {total}")
+logging.info(f"Success: {len(success)}")
+logging.info(f"Fail: {len(fail)}")
+logging.info(f"Success rate: {success_rate:.2f}%")
+logging.info(f"Ring size distribution: {dict(ring_size_counter)}")
+
+print(f"Total molecules: {total}")
+print(f"Success: {len(success)}")
+print(f"Fail: {len(fail)}")
+print(f"Success rate: {success_rate:.2f}%")
+print("Ring size distribution:")
+for size, count in sorted(ring_size_counter.items()):
+    print(f"  Ring size {size}: {count}")
+
+# 将失败的分子写入到一个 SMI 文件
+fail_smi_file = "fail_molecules.smi"
+with open(fail_smi_file, "w") as ff:
+    for smiles in fail:
+        ff.write(smiles + "\n")
+
+logging.info(f"Failed molecules written to {fail_smi_file}.")
+print(f"Failed molecules written to {fail_smi_file}.")
+
+# 将环大小分布写入文件
+ring_size_file = "ring_size_distribution.txt"
+with open(ring_size_file, "w") as rf:
+    rf.write("Ring Size\tCount\n")
+    for size, count in sorted(ring_size_counter.items()):
+        rf.write(f"{size}\t{count}\n")
+
+logging.info(f"Ring size distribution written to {ring_size_file}.")
+print(f"Ring size distribution written to {ring_size_file}.")
--- a/utils/search.ipynb
+++ b/utils/search.ipynb
@@ -0,0 +1,182 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34ca5db2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/root/micromamba/envs/qsar/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ChEMBL 数据库存储路径: /srv/project/LillyMol/test/data/35/data/chembl_35/chembl_35_sqlite/chembl_35.db\n"
+     ]
+    }
+   ],
+   "source": [
+    "import chembl_downloader\n",
+    "# 指定存储目录\n",
+    "prefix = '/srv/project/LillyMol/test/data'\n",
+    "version = chembl_downloader.latest() # version 35\n",
+    "\n",
+    "# 下载并提取 ChEMBL 数据库，存放到指定目录\n",
+    "path = chembl_downloader.download_extract_sqlite(version=version, prefix=[prefix])\n",
+    "# chembl_downloader.download_sqlite(version)\n",
+    "\n",
+    "print(f\"ChEMBL 数据库存储路径: {path}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "dd6f3b25",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(0, 'molregno', 'BIGINT', 1, None, 0)\n",
+      "(1, 'syn_type', 'VARCHAR(50)', 1, None, 0)\n",
+      "(2, 'molsyn_id', 'BIGINT', 1, None, 1)\n",
+      "(3, 'res_stem_id', 'BIGINT', 0, None, 0)\n",
+      "(4, 'synonyms', 'VARCHAR(250)', 0, None, 0)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import chembl_downloader\n",
+    "\n",
+    "with chembl_downloader.connect(version='35', prefix=['/srv/project/LillyMol/test/data']) as conn:\n",
+    "    cursor = conn.cursor()\n",
+    "    cursor.execute(\"PRAGMA table_info(molecule_synonyms);\")\n",
+    "    columns = cursor.fetchall()\n",
+    "    for col in columns:\n",
+    "        print(col)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c8390d91",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>molregno</th>\n",
+       "      <th>syn_type</th>\n",
+       "      <th>molsyn_id</th>\n",
+       "      <th>res_stem_id</th>\n",
+       "      <th>synonyms</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>27340</td>\n",
+       "      <td>TRADE_NAME</td>\n",
+       "      <td>3001861</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Brolene antibiotic</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>27340</td>\n",
+       "      <td>TRADE_NAME</td>\n",
+       "      <td>3001879</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Golden eye antibiotic</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   molregno    syn_type  molsyn_id res_stem_id               synonyms\n",
+       "0     27340  TRADE_NAME    3001861        None     Brolene antibiotic\n",
+       "1     27340  TRADE_NAME    3001879        None  Golden eye antibiotic"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import chembl_downloader\n",
+    "\n",
+    "sql = \"\"\"\n",
+    "SELECT *\n",
+    "FROM molecule_synonyms\n",
+    "WHERE syn_type='TRADE_NAME' AND synonyms LIKE '%antibiotic%'\n",
+    "\"\"\"\n",
+    "\n",
+    "\n",
+    "# version 一般也建议加上，保证和你本地一致\n",
+    "df = chembl_downloader.query(\n",
+    "    sql,\n",
+    "    version='35',   # 或者 version=version\n",
+    "    prefix=['/srv/project/LillyMol/test/data']\n",
+    ")\n",
+    "df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d0c56718",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "qsar",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/utils/search.py
+++ b/utils/search.py
@@ -0,0 +1,17 @@
+import sqlite3
+
+# 连接到数据库
+conn = sqlite3.connect('../test/data/35/data/chembl_35/chembl_35_sqlite/chembl_35.db')
+
+# 查询抗生素相关分子 (修正版)
+query = """
+SELECT m.chembl_id, m.pref_name, cs.molecule_synonym 
+FROM molecule_synonyms cs
+JOIN molecule_dictionary m ON cs.molregno = m.molregno
+WHERE cs.molecule_synonym LIKE '%antibiotic%'
+LIMIT 10
+"""
+
+cursor = conn.execute(query)
+for row in cursor.fetchall():
+    print(row)
--- a/utils/test_smart.py
+++ b/utils/test_smart.py
@@ -0,0 +1,60 @@
+from rdkit import Chem
+from joblib import Parallel, delayed
+import logging
+
+# 定义日志配置
+logging.basicConfig(
+    filename="rgroup_matching.log",
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+
+# 定义 SMARTS 模式
+macro = Chem.MolFromSmarts("[r12,r13,r14,r15,r16,r17,r18,r19,r20]([#8][#6](=[#8]))")
+
+# 读取 SMI 文件
+smi_file = "/home/mambauser/LillyMol/test/1M_stratsampled_V1B.smi"
+with open(smi_file, 'r') as f:
+    SMILES_list = [line.strip() for line in f if line.strip()]
+
+logging.info(f"Loaded {len(SMILES_list)} molecules from {smi_file}.")
+
+# 匹配函数
+def match_smarts(smiles):
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        return smiles, None  # 无效分子
+    result = mol.GetSubstructMatches(macro)
+    return smiles, result
+
+# 使用 joblib 并行处理
+logging.info("Starting SMARTS matching...")
+results = Parallel(n_jobs=-1)(delayed(match_smarts)(s) for s in SMILES_list)
+
+# 分离成功和失败的分子
+success = [smiles for smiles, result in results if result]
+fail = [smiles for smiles, result in results if not result]
+
+# 统计结果
+total = len(success) + len(fail)
+success_rate = len(success) / total * 100 if total > 0 else 0
+
+# 保存日志信息
+logging.info(f"Total molecules: {total}")
+logging.info(f"Success: {len(success)}")
+logging.info(f"Fail: {len(fail)}")
+logging.info(f"Success rate: {success_rate:.2f}%")
+
+print(f"Total molecules: {total}")
+print(f"Success: {len(success)}")
+print(f"Fail: {len(fail)}")
+print(f"Success rate: {success_rate:.2f}%")
+
+# 将失败的分子写入到一个 SMI 文件
+fail_smi_file = "fail_molecules.smi"
+with open(fail_smi_file, "w") as ff:
+    for smiles in fail:
+        ff.write(smiles + "\n")
+
+logging.info(f"Failed molecules written to {fail_smi_file}.")
+print(f"Failed molecules written to {fail_smi_file}.")
				`@@ -0,0 +1 @@`
				`CC[C@H]1OC(=O)[*][*sugar*][C@H](C)[*sugar*][*]C[*]C(=O)[*][C@@H](O)[*]1`