first add
This commit is contained in:
292
notebooks/01_cheminformatics_quickstart.ipynb
Normal file
292
notebooks/01_cheminformatics_quickstart.ipynb
Normal file
@@ -0,0 +1,292 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# sqlmodel-pg-kit — Cheminformatics Quickstart\n\n",
|
||||
"This notebook demonstrates how to use `sqlmodel-pg-kit` for day-to-day data work, including:\n",
|
||||
"- Environment setup (SQLite smoke vs PostgreSQL)\n",
|
||||
"- Core CRUD with SQLModel sessions\n",
|
||||
"- Multi-table schema and joins for molecules and datasets\n",
|
||||
"- Optional RDKit + Mordred descriptor computation and storage\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 0. Install (in Jupyter)\n",
|
||||
"Uncomment as needed to install the package and optional chem deps."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# %pip install -e . pytest\n",
|
||||
"# Optional: RDKit + Mordred\n",
|
||||
"# %pip install rdkit-pypi mordred\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 1. Basic kit usage (SQLite smoke)\n",
|
||||
"Use the built-in smoke test pattern with an in-memory SQLite DB to verify CRUD paths."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"# Add the src directory to the path so we can import sqlmodel_pg_kit\n",
|
||||
"sys.path.insert(0, os.path.join(os.getcwd(), '..', 'src'))\n",
|
||||
"\n",
|
||||
"from sqlmodel_pg_kit import db, create_all, Repository\n",
|
||||
"\n",
|
||||
"# Override to SQLite in-memory for quick check\n",
|
||||
"db.cfg = db.DatabaseConfig(host='', port=0, user='', password='', database=':memory:', sslmode='disable')\n",
|
||||
"db.engine = db.create_engine('sqlite:///:memory:', echo=False)\n",
|
||||
"\n",
|
||||
"from typing import Optional\n",
|
||||
"from sqlmodel import SQLModel, Field\n",
|
||||
"\n",
|
||||
"class Hero(SQLModel, table=True):\n",
|
||||
" id: Optional[int] = Field(default=None, primary_key=True)\n",
|
||||
" name: str = Field(index=True)\n",
|
||||
" age: Optional[int] = None\n",
|
||||
"\n",
|
||||
"create_all()\n",
|
||||
"repo = Repository(Hero)\n",
|
||||
"from sqlmodel_pg_kit.db import get_session\n",
|
||||
"with get_session() as s:\n",
|
||||
" repo.create(s, {'name': 'Iron Man', 'age': 45})\n",
|
||||
" [h.name for h in repo.list(s)]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 2. Connect to PostgreSQL (optional)\n",
|
||||
"Export `SQL_*` or `PG*` variables in your shell before starting Jupyter or set them here."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"# Example (adjust to your environment, uncomment to set in-process):\n",
|
||||
"# os.environ['SQL_HOST'] = '127.0.0.1'\n",
|
||||
"# os.environ['SQL_PORT'] = '5432'\n",
|
||||
"# os.environ['SQL_USER'] = 'postgres'\n",
|
||||
"# os.environ['SQL_PASSWORD'] = 'change-me-strong'\n",
|
||||
"# os.environ['SQL_DATABASE'] = 'appdb'\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 3. Cheminformatics schema (Molecule, Dataset, MoleculeDataset)\n",
|
||||
"Define models inline (you can also place them in your own package)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from __future__ import annotations\n",
|
||||
"from dataclasses import dataclass\n",
|
||||
"from datetime import datetime\n",
|
||||
"from typing import List, Optional\n",
|
||||
"\n",
|
||||
"from sqlmodel_pg_kit.db import get_session, engine\n",
|
||||
"from sqlmodel import SQLModel, Field, Relationship, select\n",
|
||||
"from sqlalchemy.orm import Mapped\n",
|
||||
"\n",
|
||||
"class MoleculeDataset(SQLModel, table=True):\n",
|
||||
" molecule_id: int = Field(foreign_key='molecule.id', primary_key=True)\n",
|
||||
" dataset_id: int = Field(foreign_key='dataset.id', primary_key=True)\n",
|
||||
" added_at: datetime = Field(default_factory=datetime.utcnow)\n",
|
||||
"\n",
|
||||
"class Molecule(SQLModel, table=True):\n",
|
||||
" id: Optional[int] = Field(default=None, primary_key=True)\n",
|
||||
" smiles: str = Field(index=True)\n",
|
||||
" selfies: Optional[str] = Field(default=None)\n",
|
||||
" qed: Optional[float] = Field(default=None, index=True)\n",
|
||||
" sa_score: Optional[float] = Field(default=None, index=True)\n",
|
||||
" created_at: datetime = Field(default_factory=datetime.utcnow)\n",
|
||||
" updated_at: datetime = Field(default_factory=datetime.utcnow)\n",
|
||||
" datasets: Mapped[List[\"Dataset\"]] = Relationship(back_populates=\"molecules\", link_model=MoleculeDataset)\n",
|
||||
"\n",
|
||||
"class Dataset(SQLModel, table=True):\n",
|
||||
" id: Optional[int] = Field(default=None, primary_key=True)\n",
|
||||
" name: str = Field(index=True)\n",
|
||||
" molecules: Mapped[List[\"Molecule\"]] = Relationship(back_populates=\"datasets\", link_model=MoleculeDataset)\n",
|
||||
"\n",
|
||||
"@dataclass\n",
|
||||
"class MoleculeDTO:\n",
|
||||
" smiles: str\n",
|
||||
" selfies: Optional[str] = None\n",
|
||||
" qed: Optional[float] = None\n",
|
||||
" sa_score: Optional[float] = None\n",
|
||||
" def to_model(self) -> Molecule:\n",
|
||||
" return Molecule(**vars(self))\n",
|
||||
"\n",
|
||||
"# Create tables for these models\n",
|
||||
"SQLModel.metadata.create_all(engine)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 4. CRUD and common queries\n",
|
||||
"Insert molecules/datasets, link them, filter and join."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Clean\n",
|
||||
"with get_session() as s:\n",
|
||||
" s.execute(MoleculeDataset.__table__.delete())\n",
|
||||
" s.execute(Molecule.__table__.delete())\n",
|
||||
" s.execute(Dataset.__table__.delete())\n",
|
||||
" s.commit()\n",
|
||||
"\n",
|
||||
"# Insert molecules via DTO\n",
|
||||
"mols = [\n",
|
||||
" MoleculeDTO(smiles='CCO', qed=0.45, sa_score=2.1),\n",
|
||||
" MoleculeDTO(smiles='c1ccccc1', qed=0.76, sa_score=3.5),\n",
|
||||
" MoleculeDTO(smiles='CCN(CC)CC', qed=0.62, sa_score=2.8),\n",
|
||||
"]\n",
|
||||
"with get_session() as s:\n",
|
||||
" s.add_all([dto.to_model() for dto in mols])\n",
|
||||
" s.commit()\n",
|
||||
"\n",
|
||||
"# Datasets and linking\n",
|
||||
"with get_session() as s:\n",
|
||||
" ds_train = Dataset(name='train'); ds_holdout = Dataset(name='holdout')\n",
|
||||
" s.add_all([ds_train, ds_holdout]); s.commit(); s.refresh(ds_train); s.refresh(ds_holdout)\n",
|
||||
" mol_list = s.exec(select(Molecule).order_by(Molecule.id.asc())).all()\n",
|
||||
" links = [\n",
|
||||
" MoleculeDataset(molecule_id=mol_list[0].id, dataset_id=ds_train.id),\n",
|
||||
" MoleculeDataset(molecule_id=mol_list[1].id, dataset_id=ds_train.id),\n",
|
||||
" MoleculeDataset(molecule_id=mol_list[2].id, dataset_id=ds_holdout.id),\n",
|
||||
" ]\n",
|
||||
" s.add_all(links); s.commit()\n",
|
||||
"\n",
|
||||
"# Update one\n",
|
||||
"with get_session() as s:\n",
|
||||
" mol = s.exec(select(Molecule).where(Molecule.smiles=='CCO')).one()\n",
|
||||
" mol.qed = 0.50; s.add(mol); s.commit(); s.refresh(mol)\n",
|
||||
" mol.qed\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Filters and joins\n",
|
||||
"from sqlalchemy.orm import selectinload\n",
|
||||
"\n",
|
||||
"with get_session() as s:\n",
|
||||
" hi_qed = s.exec(select(Molecule).where(Molecule.qed>=0.6).order_by(Molecule.sa_score.asc())).all()\n",
|
||||
" hi_qed_view = [(m.smiles, m.qed, m.sa_score) for m in hi_qed]\n",
|
||||
"\n",
|
||||
"with get_session() as s:\n",
|
||||
" stmt = select(Molecule).options(selectinload(Molecule.datasets)).order_by(Molecule.id.asc())\n",
|
||||
" mols_with_ds = s.exec(stmt).all()\n",
|
||||
" mols_with_ds_view = [(m.smiles, [d.name for d in m.datasets]) for m in mols_with_ds]\n",
|
||||
"\n",
|
||||
"with get_session() as s:\n",
|
||||
" stmt = (select(Molecule)\n",
|
||||
" .join(MoleculeDataset, Molecule.id==MoleculeDataset.molecule_id)\n",
|
||||
" .join(Dataset, Dataset.id==MoleculeDataset.dataset_id)\n",
|
||||
" .where(Dataset.name=='train')\n",
|
||||
" .order_by(Molecule.id.asc()))\n",
|
||||
" train_mols = s.exec(stmt).all()\n",
|
||||
" train_mols_view = [m.smiles for m in train_mols]\n",
|
||||
"\n",
|
||||
"hi_qed_view, mols_with_ds_view, train_mols_view\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 5. Optional: RDKit + Mordred integration\n",
|
||||
"Compute descriptors and store them. If you prefer flexible storage, use a JSONB column or a normalized EAV table."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" from rdkit import Chem\n",
|
||||
" from rdkit.Chem import QED\n",
|
||||
" from mordred import Calculator, descriptors\n",
|
||||
" rdkit_ok = True\n",
|
||||
"except Exception as e:\n",
|
||||
" rdkit_ok = False\n",
|
||||
" print('RDKit/Mordred not available in this environment. Skipping demo.\n', e)\n",
|
||||
"\n",
|
||||
"if rdkit_ok:\n",
|
||||
" mol = Chem.MolFromSmiles('c1ccccc1O')\n",
|
||||
" qed = float(QED.qed(mol))\n",
|
||||
" calc = Calculator(descriptors, ignore_3D=True)\n",
|
||||
" md = calc(mol)\n",
|
||||
" # keep numeric descriptors only\n",
|
||||
" desc = {k: float(v) for k, v in md.items() if v is not None and isinstance(v, (int, float))}\n",
|
||||
" print('qed:', qed, 'num_desc:', len(desc))\n",
|
||||
"\n",
|
||||
" # Upsert molecule with refined qed as a column; optionally also persist `desc` via JSONB/EAV patterns.\n",
|
||||
" with get_session() as s:\n",
|
||||
" m = s.exec(select(Molecule).where(Molecule.smiles=='c1ccccc1O')).first()\n",
|
||||
" if m is None:\n",
|
||||
" m = Molecule(smiles='c1ccccc1O', qed=qed)\n",
|
||||
" else:\n",
|
||||
" m.qed = qed\n",
|
||||
" s.add(m); s.commit(); s.refresh(m)\n",
|
||||
" print('Stored molecule id:', m.id)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python",
|
||||
"pygments_lexer": "ipython3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user