first add

This commit is contained in:
2025-08-17 22:18:45 +08:00
commit 093d8efd3b
32 changed files with 3531 additions and 0 deletions

View File

@@ -0,0 +1,292 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# sqlmodel-pg-kit — Cheminformatics Quickstart\n\n",
"This notebook demonstrates how to use `sqlmodel-pg-kit` for day-to-day data work, including:\n",
"- Environment setup (SQLite smoke vs PostgreSQL)\n",
"- Core CRUD with SQLModel sessions\n",
"- Multi-table schema and joins for molecules and datasets\n",
"- Optional RDKit + Mordred descriptor computation and storage\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 0. Install (in Jupyter)\n",
"Uncomment as needed to install the package and optional chem deps."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# %pip install -e . pytest\n",
"# Optional: RDKit + Mordred (the PyPI package is named `rdkit`; `rdkit-pypi` is the legacy name)\n",
"# %pip install rdkit mordred\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Basic kit usage (SQLite smoke)\n",
"Use the built-in smoke test pattern with an in-memory SQLite DB to verify CRUD paths."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"\n",
"# Add the src directory to the path so we can import sqlmodel_pg_kit\n",
"sys.path.insert(0, os.path.join(os.getcwd(), '..', 'src'))\n",
"\n",
"from sqlmodel_pg_kit import db, create_all, Repository\n",
"from sqlmodel_pg_kit.db import get_session\n",
"\n",
"# Override to SQLite in-memory for quick check\n",
"db.cfg = db.DatabaseConfig(host='', port=0, user='', password='', database=':memory:', sslmode='disable')\n",
"db.engine = db.create_engine('sqlite:///:memory:', echo=False)\n",
"\n",
"from typing import Optional\n",
"from sqlmodel import SQLModel, Field\n",
"\n",
"class Hero(SQLModel, table=True):\n",
"    id: Optional[int] = Field(default=None, primary_key=True)\n",
"    name: str = Field(index=True)\n",
"    age: Optional[int] = None\n",
"\n",
"create_all()\n",
"repo = Repository(Hero)\n",
"with get_session() as s:\n",
"    repo.create(s, {'name': 'Iron Man', 'age': 45})\n",
"    # Collect plain strings while the session is open.\n",
"    hero_names = [h.name for h in repo.list(s)]\n",
"\n",
"# Jupyter only displays the last *top-level* expression of a cell,\n",
"# so the result is shown here rather than inside the `with` block.\n",
"hero_names\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Connect to PostgreSQL (optional)\n",
"Export `SQL_*` or `PG*` variables in your shell before starting Jupyter or set them here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"# Example (adjust to your environment, uncomment to set in-process):\n",
"# os.environ['SQL_HOST'] = '127.0.0.1'\n",
"# os.environ['SQL_PORT'] = '5432'\n",
"# os.environ['SQL_USER'] = 'postgres'\n",
"# os.environ['SQL_PASSWORD'] = 'change-me-strong'\n",
"# os.environ['SQL_DATABASE'] = 'appdb'\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Cheminformatics schema (Molecule, Dataset, MoleculeDataset)\n",
"Define models inline (you can also place them in your own package)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: `from __future__ import annotations` is deliberately not used here.\n",
"# It turns every annotation into a string, and the relationship annotations\n",
"# below must be real typing objects for the ORM mapping to resolve them.\n",
"from dataclasses import dataclass\n",
"from datetime import datetime\n",
"from typing import List, Optional\n",
"\n",
"from sqlmodel_pg_kit.db import get_session, engine\n",
"from sqlmodel import SQLModel, Field, Relationship, select\n",
"from sqlalchemy.orm import Mapped\n",
"\n",
"# Link table for the many-to-many Molecule <-> Dataset association.\n",
"class MoleculeDataset(SQLModel, table=True):\n",
"    molecule_id: int = Field(foreign_key='molecule.id', primary_key=True)\n",
"    dataset_id: int = Field(foreign_key='dataset.id', primary_key=True)\n",
"    added_at: datetime = Field(default_factory=datetime.utcnow)\n",
"\n",
"# A molecule row with a few indexed descriptor columns.\n",
"class Molecule(SQLModel, table=True):\n",
"    id: Optional[int] = Field(default=None, primary_key=True)\n",
"    smiles: str = Field(index=True)\n",
"    selfies: Optional[str] = Field(default=None)\n",
"    qed: Optional[float] = Field(default=None, index=True)\n",
"    sa_score: Optional[float] = Field(default=None, index=True)\n",
"    created_at: datetime = Field(default_factory=datetime.utcnow)\n",
"    updated_at: datetime = Field(default_factory=datetime.utcnow)\n",
"    datasets: Mapped[List[\"Dataset\"]] = Relationship(back_populates=\"molecules\", link_model=MoleculeDataset)\n",
"\n",
"# A named collection of molecules (e.g. 'train', 'holdout').\n",
"class Dataset(SQLModel, table=True):\n",
"    id: Optional[int] = Field(default=None, primary_key=True)\n",
"    name: str = Field(index=True)\n",
"    molecules: Mapped[List[\"Molecule\"]] = Relationship(back_populates=\"datasets\", link_model=MoleculeDataset)\n",
"\n",
"# Plain dataclass carrying descriptor values; to_model() converts it to a Molecule row.\n",
"@dataclass\n",
"class MoleculeDTO:\n",
"    smiles: str\n",
"    selfies: Optional[str] = None\n",
"    qed: Optional[float] = None\n",
"    sa_score: Optional[float] = None\n",
"\n",
"    def to_model(self) -> Molecule:\n",
"        # Field names match the Molecule columns, so they can be passed straight through.\n",
"        return Molecule(**vars(self))\n",
"\n",
"# Create tables for these models\n",
"SQLModel.metadata.create_all(engine)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. CRUD and common queries\n",
"Insert molecules/datasets, link them, filter and join."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clean: delete link rows first, then the tables they reference.\n",
"with get_session() as s:\n",
"    s.execute(MoleculeDataset.__table__.delete())\n",
"    s.execute(Molecule.__table__.delete())\n",
"    s.execute(Dataset.__table__.delete())\n",
"    s.commit()\n",
"\n",
"# Insert molecules via DTO\n",
"mols = [\n",
"    MoleculeDTO(smiles='CCO', qed=0.45, sa_score=2.1),\n",
"    MoleculeDTO(smiles='c1ccccc1', qed=0.76, sa_score=3.5),\n",
"    MoleculeDTO(smiles='CCN(CC)CC', qed=0.62, sa_score=2.8),\n",
"]\n",
"with get_session() as s:\n",
"    s.add_all([dto.to_model() for dto in mols])\n",
"    s.commit()\n",
"\n",
"# Datasets and linking\n",
"with get_session() as s:\n",
"    ds_train = Dataset(name='train')\n",
"    ds_holdout = Dataset(name='holdout')\n",
"    s.add_all([ds_train, ds_holdout])\n",
"    s.commit()\n",
"    # Refresh so the generated primary keys are loaded before building links.\n",
"    s.refresh(ds_train)\n",
"    s.refresh(ds_holdout)\n",
"    mol_list = s.exec(select(Molecule).order_by(Molecule.id.asc())).all()\n",
"    links = [\n",
"        MoleculeDataset(molecule_id=mol_list[0].id, dataset_id=ds_train.id),\n",
"        MoleculeDataset(molecule_id=mol_list[1].id, dataset_id=ds_train.id),\n",
"        MoleculeDataset(molecule_id=mol_list[2].id, dataset_id=ds_holdout.id),\n",
"    ]\n",
"    s.add_all(links)\n",
"    s.commit()\n",
"\n",
"# Update one\n",
"with get_session() as s:\n",
"    mol = s.exec(select(Molecule).where(Molecule.smiles=='CCO')).one()\n",
"    mol.qed = 0.50\n",
"    s.add(mol)\n",
"    s.commit()\n",
"    s.refresh(mol)\n",
"    updated_qed = mol.qed\n",
"\n",
"# Shown at top level: an expression nested inside `with` is never displayed by Jupyter.\n",
"updated_qed\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Filters and joins\n",
"from sqlalchemy.orm import selectinload\n",
"\n",
"with get_session() as s:\n",
" hi_qed = s.exec(select(Molecule).where(Molecule.qed>=0.6).order_by(Molecule.sa_score.asc())).all()\n",
" hi_qed_view = [(m.smiles, m.qed, m.sa_score) for m in hi_qed]\n",
"\n",
"with get_session() as s:\n",
" stmt = select(Molecule).options(selectinload(Molecule.datasets)).order_by(Molecule.id.asc())\n",
" mols_with_ds = s.exec(stmt).all()\n",
" mols_with_ds_view = [(m.smiles, [d.name for d in m.datasets]) for m in mols_with_ds]\n",
"\n",
"with get_session() as s:\n",
" stmt = (select(Molecule)\n",
" .join(MoleculeDataset, Molecule.id==MoleculeDataset.molecule_id)\n",
" .join(Dataset, Dataset.id==MoleculeDataset.dataset_id)\n",
" .where(Dataset.name=='train')\n",
" .order_by(Molecule.id.asc()))\n",
" train_mols = s.exec(stmt).all()\n",
" train_mols_view = [m.smiles for m in train_mols]\n",
"\n",
"hi_qed_view, mols_with_ds_view, train_mols_view\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Optional: RDKit + Mordred integration\n",
"Compute descriptors and store them. If you prefer flexible storage, use a JSONB column or a normalized EAV table."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The chem stack is optional: record whether it imported and skip the demo otherwise.\n",
"try:\n",
"    from rdkit import Chem\n",
"    from rdkit.Chem import QED\n",
"    from mordred import Calculator, descriptors\n",
"    rdkit_ok = True\n",
"except Exception as e:\n",
"    rdkit_ok = False\n",
"    print('RDKit/Mordred not available in this environment. Skipping demo.\\n', e)\n",
"\n",
"if rdkit_ok:\n",
"    # Named rd_mol so it does not shadow the ORM `mol` object from the CRUD cell.\n",
"    rd_mol = Chem.MolFromSmiles('c1ccccc1O')\n",
"    qed = float(QED.qed(rd_mol))\n",
"    calc = Calculator(descriptors, ignore_3D=True)\n",
"    md = calc(rd_mol)\n",
"    # keep numeric descriptors only\n",
"    desc = {k: float(v) for k, v in md.items() if v is not None and isinstance(v, (int, float))}\n",
"    print('qed:', qed, 'num_desc:', len(desc))\n",
"\n",
"    # Upsert molecule with refined qed as a column; optionally also persist `desc` via JSONB/EAV patterns.\n",
"    with get_session() as s:\n",
"        m = s.exec(select(Molecule).where(Molecule.smiles=='c1ccccc1O')).first()\n",
"        if m is None:\n",
"            m = Molecule(smiles='c1ccccc1O', qed=qed)\n",
"        else:\n",
"            m.qed = qed\n",
"        s.add(m)\n",
"        s.commit()\n",
"        s.refresh(m)\n",
"        print('Stored molecule id:', m.id)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,18 @@
{
"cells": [
{"cell_type":"markdown","metadata":{},"source":["# 01 — Sync CRUD Tutorial\n\n","Covers create/get/list/update/delete using the kit's helpers. Optional SQLite override for quick demo."]},
{"cell_type":"markdown","metadata":{},"source":["## 0. Install (optional)\n","Uncomment if you haven't installed the package in this env."]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# %pip install -e . pytest\n"]},
{"cell_type":"markdown","metadata":{},"source":["## 1. Optional: use in-memory SQLite for demo\n","Comment this cell if you want to use Postgres via SQL_*/PG* env vars."]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from sqlmodel_pg_kit import db\n","db.cfg = db.DatabaseConfig(host='', port=0, user='', password='', database=':memory:', sslmode='disable')\n","db.engine = db.create_engine('sqlite:///:memory:', echo=False)\n"]},
{"cell_type":"markdown","metadata":{},"source":["## 2. CRUD operations (generic Repository)\n","Define a simple model and use the generic Repository for CRUD."]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from typing import Optional\n","from sqlmodel import SQLModel, Field\n","from sqlmodel_pg_kit import create_all, Repository\n","from sqlmodel_pg_kit.db import get_session\n","\n","class Hero(SQLModel, table=True):\n","    id: Optional[int] = Field(default=None, primary_key=True)\n","    name: str = Field(index=True)\n","    age: Optional[int] = None\n","\n","create_all()\n","repo = Repository(Hero)\n","with get_session() as s:\n","    h = repo.create(s, {'name': 'Alice', 'age': 20})\n","    # Snapshot plain values while the session is still open.\n","    created = (h.id, h.name, h.age)\n","\n","# Displayed at top level: Jupyter ignores expressions nested inside a `with` block.\n","created\n"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["with get_session() as s:\n","    h2 = repo.get(s, h.id)\n","\n","# Displayed at top level: Jupyter ignores expressions nested inside a `with` block.\n","h2\n"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["with get_session() as s:\n","    h3 = repo.update(s, h.id, age=21)\n","    # Snapshot plain values while the session is still open.\n","    updated = (h3.id, h3.name, h3.age)\n","\n","# Displayed at top level: Jupyter ignores expressions nested inside a `with` block.\n","updated\n"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["with get_session() as s:\n","    page = repo.list(s, page=1, size=5)\n","    # Build plain tuples while the session is still open.\n","    page_view = [(x.id, x.name, x.age) for x in page]\n","\n","# Displayed at top level: Jupyter ignores expressions nested inside a `with` block.\n","page_view\n"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["with get_session() as s:\n","    ok = repo.delete(s, h.id)\n","\n","# Displayed at top level: Jupyter ignores expressions nested inside a `with` block.\n","ok\n"]}
],
"metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"name": "python", "pygments_lexer": "ipython3"}},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,15 @@
{
"cells": [
{"cell_type":"markdown","metadata":{},"source":["# 02 — Bulk Insert and Filters\n\n","Demonstrates bulk inserts and filtering with SQLModel expressions."]},
{"cell_type":"markdown","metadata":{},"source":["## 0. Install (optional)"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# %pip install -e . pytest\n"]},
{"cell_type":"markdown","metadata":{},"source":["## 1. Optional: SQLite in-memory for demo"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from sqlmodel_pg_kit import db\n","db.cfg = db.DatabaseConfig(host='', port=0, user='', password='', database=':memory:', sslmode='disable')\n","db.engine = db.create_engine('sqlite:///:memory:', echo=False)\n"]},
{"cell_type":"markdown","metadata":{},"source":["## 2. Bulk insert and query"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from typing import List, Optional\n","from sqlmodel import select, SQLModel, Field\n","from sqlmodel_pg_kit import create_all, Repository\n","from sqlmodel_pg_kit.db import get_session\n","\n","class Hero(SQLModel, table=True):\n","    id: Optional[int] = Field(default=None, primary_key=True)\n","    name: str = Field(index=True)\n","    age: Optional[int] = None\n","\n","create_all()\n","repo = Repository(Hero)\n","# Clean slate, then bulk insert in the same session\n","with get_session() as s:\n","    s.exec(select(Hero))  # warmup\n","    s.execute(Hero.__table__.delete())\n","    s.commit()\n","    rows = [\n","        {'name': 'PG Hero', 'age': 1},\n","        {'name': 'PG Hero', 'age': 2},\n","        {'name': 'Bob', 'age': 30},\n","        {'name': 'Carol', 'age': 40},\n","    ]\n","    repo.bulk_insert(s, rows)\n","\n","with get_session() as s:\n","    heroes: List[Hero] = s.exec(select(Hero).where(Hero.name=='PG Hero')).all()\n","    # Build plain tuples while the session is still open.\n","    pg_heroes_view = [(h.id, h.age) for h in heroes]\n","\n","# Displayed at top level: Jupyter ignores expressions nested inside a `with` block.\n","pg_heroes_view\n"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["with get_session() as s:\n"," res = s.exec(select(Hero).where(Hero.age>=2).order_by(Hero.age.asc())).all()\n","[(h.name, h.age) for h in res]\n"]}
],
"metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"name": "python", "pygments_lexer": "ipython3"}},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,14 @@
{
"cells": [
{"cell_type":"markdown","metadata":{},"source":["# 03 — Relationships (Team ↔ Hero)\n\n","Demonstrates one-to-many relationship and eager loading with selectinload."]},
{"cell_type":"markdown","metadata":{},"source":["## 0. Install (optional)"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# %pip install -e . pytest\n"]},
{"cell_type":"markdown","metadata":{},"source":["## 1. Optional: SQLite in-memory for demo"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from sqlmodel_pg_kit import db\n","db.cfg = db.DatabaseConfig(host='', port=0, user='', password='', database=':memory:', sslmode='disable')\n","db.engine = db.create_engine('sqlite:///:memory:', echo=False)\n"]},
{"cell_type":"markdown","metadata":{},"source":["## 2. Create Team and Hero, eager load with selectinload"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from typing import List, Optional\n","from sqlalchemy.orm import selectinload\n","from sqlmodel import select, SQLModel, Field, Relationship\n","from sqlmodel_pg_kit import create_all\n","from sqlmodel_pg_kit.db import get_session\n","\n","class Team(SQLModel, table=True):\n"," id: Optional[int] = Field(default=None, primary_key=True)\n"," name: str = Field(index=True)\n"," heroes: List['Hero'] = Relationship(back_populates='team')\n","\n","class Hero(SQLModel, table=True):\n"," id: Optional[int] = Field(default=None, primary_key=True)\n"," name: str = Field(index=True)\n"," age: Optional[int] = None\n"," team_id: Optional[int] = Field(default=None, foreign_key='team.id')\n"," team: Optional[Team] = Relationship(back_populates='heroes')\n","\n","create_all()\n","with get_session() as s:\n"," s.execute(Hero.__table__.delete())\n"," s.execute(Team.__table__.delete())\n"," s.commit()\n"," t = Team(name='Avengers'); s.add(t); s.commit(); s.refresh(t)\n"," s.add(Hero(name='Thor', age=1500, team_id=t.id))\n"," s.add(Hero(name='Hulk', age=49, team_id=t.id))\n"," s.commit()\n","\n","stmt = select(Hero).options(selectinload(Hero.team)).order_by(Hero.id.asc())\n","with get_session() as s:\n"," heroes: List[Hero] = s.exec(stmt).all()\n","[(h.name, h.team.name if h.team else None) for h in heroes]\n"]}
],
"metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"name": "python", "pygments_lexer": "ipython3"}},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,14 @@
{
"cells": [
{"cell_type":"markdown","metadata":{},"source":["# 04 — Async CRUD Tutorial\n\n","Demonstrates async session usage. Includes optional SQLite async override."]},
{"cell_type":"markdown","metadata":{},"source":["## 0. Install (optional)"]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# %pip install -e . pytest\n"]},
{"cell_type":"markdown","metadata":{},"source":["## 1. Optional: SQLite async override for demo\n","If you don't have Postgres ready, configure an async SQLite engine."]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker\n","from sqlmodel_pg_kit import db, create_all\n","# Keep sync paths on SQLite in-memory for create_all\n","db.cfg = db.DatabaseConfig(host='', port=0, user='', password='', database=':memory:', sslmode='disable')\n","db.engine = db.create_engine('sqlite:///:memory:', echo=False)\n","create_all()\n","# Now override async engine/session to SQLite as well\n","db.async_engine = create_async_engine('sqlite+aiosqlite:///:memory:', echo=False)\n","db.AsyncSessionLocal = async_sessionmaker(db.async_engine, expire_on_commit=False)\n"]},
{"cell_type":"markdown","metadata":{},"source":["## 2. Async CRUD\n","Clean table, insert, and read back using async session."]},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["from typing import Optional\n","from sqlmodel import select, SQLModel, Field\n","from sqlmodel_pg_kit import AsyncRepository, create_all\n","from sqlmodel_pg_kit.db import get_async_session\n","\n","class Hero(SQLModel, table=True):\n","    id: Optional[int] = Field(default=None, primary_key=True)\n","    name: str = Field(index=True)\n","    age: Optional[int] = None\n","\n","# Creates tables through the sync engine only.\n","create_all()\n","repo = AsyncRepository(Hero)\n","\n","async def amain():\n","    # Clean the hero table, insert one row, and return the hero names read back.\n","    async with get_async_session() as s:\n","        # With the SQLite override above, the async engine is a separate in-memory\n","        # database from the sync one, so the tables must also be created on the\n","        # connection this session actually uses (a no-op if they already exist).\n","        conn = await s.connection()\n","        await conn.run_sync(SQLModel.metadata.create_all)\n","        await s.commit()\n","\n","        await s.execute(Hero.__table__.delete())\n","        await s.commit()\n","        await repo.create(s, {'name': 'Async Hero', 'age': 7})\n","        res = await s.execute(select(Hero))\n","        heroes = res.scalars().all()\n","        return [h.name for h in heroes]\n","\n","# Jupyter already runs an event loop, so asyncio.run() would raise RuntimeError here;\n","# IPython supports top-level await instead.\n","await amain()\n"]}
],
"metadata": {"kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, "language_info": {"name": "python", "pygments_lexer": "ipython3"}},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,372 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cheminformatics Tutorial — Molecules, Datasets, CRUD & Joins\n\n",
"This notebook is a teaching version of `examples/05_cheminformatics.py`.\n",
"It demonstrates:\n",
"- Modeling molecules with descriptors (smiles, selfies, qed, sa_score)\n",
"- Linking molecules to datasets (many-to-many)\n",
"- Dataclass interop for fast inserts\n",
"- Common CRUD, filtering, eager loading, and joins\n",
"- Optional RDKit + Mordred descriptor computation (if installed)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 0. Environment (micromamba)\n",
"In your shell, activate the env before launching Jupyter:\n",
"```bash\n",
"micromamba activate sqlmodel\n",
"jupyter lab # or jupyter notebook\n",
"```\n\n",
"Optional installs inside Jupyter (uncomment to run):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# %pip install -e . pytest\n",
"# Optional cheminformatics packages (the PyPI package is named `rdkit`; `rdkit-pypi` is the legacy name):\n",
"# %pip install rdkit mordred\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Configure database connection\n",
"- For quick smoke in-memory SQLite, see the cell below.\n",
"- For PostgreSQL, ensure `SQL_*` or `PG*` env vars are set before starting Jupyter."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sqlmodel_pg_kit import db, create_all as _create_all\n",
"\n",
"# QUICK OPTION: Use SQLite in-memory for learning/demo.\n",
"# Comment these two lines out if you prefer to use Postgres via environment variables.\n",
"db.cfg = db.DatabaseConfig(host='', port=0, user='', password='', database=':memory:', sslmode='disable')\n",
"db.engine = db.create_engine('sqlite:///:memory:', echo=False)\n",
"_create_all() # create base kit models if any\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Define models & dataclass\n",
"We define `Molecule`, `Dataset`, and the link table `MoleculeDataset`.\n",
"We also provide a `MoleculeDTO` dataclass to show how to bring computed values\n",
"(e.g., from RDKit/Mordred pipelines) into SQLModel quickly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: `from __future__ import annotations` is deliberately not used here.\n",
"# It turns every annotation into a string, and the relationship annotations\n",
"# below must be real typing objects for the ORM mapping to resolve them.\n",
"from dataclasses import dataclass\n",
"from datetime import datetime\n",
"from typing import List, Optional\n",
"\n",
"from sqlalchemy.orm import selectinload\n",
"from sqlmodel import SQLModel, Field, Relationship, select\n",
"from sqlmodel_pg_kit.db import get_session, engine\n",
"\n",
"# Link table for the many-to-many Molecule <-> Dataset association.\n",
"class MoleculeDataset(SQLModel, table=True):\n",
"    molecule_id: int = Field(foreign_key='molecule.id', primary_key=True)\n",
"    dataset_id: int = Field(foreign_key='dataset.id', primary_key=True)\n",
"    added_at: datetime = Field(default_factory=datetime.utcnow)\n",
"\n",
"# A molecule row with a few indexed descriptor columns.\n",
"class Molecule(SQLModel, table=True):\n",
"    id: Optional[int] = Field(default=None, primary_key=True)\n",
"    smiles: str = Field(index=True)\n",
"    selfies: Optional[str] = Field(default=None)\n",
"    qed: Optional[float] = Field(default=None, index=True)\n",
"    sa_score: Optional[float] = Field(default=None, index=True)\n",
"    created_at: datetime = Field(default_factory=datetime.utcnow)\n",
"    updated_at: datetime = Field(default_factory=datetime.utcnow)\n",
"    datasets: List['Dataset'] = Relationship(back_populates='molecules', link_model=MoleculeDataset)\n",
"\n",
"# A named collection of molecules (e.g. 'train', 'holdout').\n",
"class Dataset(SQLModel, table=True):\n",
"    id: Optional[int] = Field(default=None, primary_key=True)\n",
"    name: str = Field(index=True)\n",
"    molecules: List['Molecule'] = Relationship(back_populates='datasets', link_model=MoleculeDataset)\n",
"\n",
"# Plain dataclass carrying descriptor values; to_model() converts it to a Molecule row.\n",
"@dataclass\n",
"class MoleculeDTO:\n",
"    smiles: str\n",
"    selfies: Optional[str] = None\n",
"    qed: Optional[float] = None\n",
"    sa_score: Optional[float] = None\n",
"\n",
"    def to_model(self) -> Molecule:\n",
"        # Field names match the Molecule columns, so they can be passed straight through.\n",
"        return Molecule(**vars(self))\n",
"\n",
"# Create the tables defined above\n",
"SQLModel.metadata.create_all(engine)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Clean slate (idempotent runs)\n",
"We delete existing rows to make this notebook repeatable."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with get_session() as s:\n",
" s.execute(MoleculeDataset.__table__.delete())\n",
" s.execute(Molecule.__table__.delete())\n",
" s.execute(Dataset.__table__.delete())\n",
" s.commit()\n",
"'cleaned'\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Insert molecules via dataclass\n",
"Create a few molecules as you would after computing descriptors upstream."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mols = [\n",
"    MoleculeDTO(smiles='CCO', qed=0.45, sa_score=2.1),\n",
"    MoleculeDTO(smiles='c1ccccc1', qed=0.76, sa_score=3.5),\n",
"    MoleculeDTO(smiles='CCN(CC)CC', qed=0.62, sa_score=2.8),\n",
"]\n",
"with get_session() as s:\n",
"    # Convert every DTO to a Molecule row and stage them in one call.\n",
"    s.add_all([dto.to_model() for dto in mols])\n",
"    s.commit()\n",
"\n",
"# Read the rows back in a fresh session to show the generated ids.\n",
"with get_session() as s:\n",
"    inserted = s.exec(select(Molecule).order_by(Molecule.id.asc())).all()\n",
"[(m.id, m.smiles, m.qed, m.sa_score) for m in inserted]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Create datasets and link molecules\n",
"Use a many-to-many link table to assign molecules to `train` or `holdout`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with get_session() as s:\n",
"    ds_train = Dataset(name='train')\n",
"    ds_holdout = Dataset(name='holdout')\n",
"    s.add_all([ds_train, ds_holdout])\n",
"    s.commit()\n",
"    # Refresh so the generated primary keys are loaded before building links.\n",
"    s.refresh(ds_train)\n",
"    s.refresh(ds_holdout)\n",
"    mol_list: List[Molecule] = s.exec(select(Molecule).order_by(Molecule.id.asc())).all()\n",
"    links = [\n",
"        MoleculeDataset(molecule_id=mol_list[0].id, dataset_id=ds_train.id),\n",
"        MoleculeDataset(molecule_id=mol_list[1].id, dataset_id=ds_train.id),\n",
"        MoleculeDataset(molecule_id=mol_list[2].id, dataset_id=ds_holdout.id),\n",
"    ]\n",
"    s.add_all(links)\n",
"    s.commit()\n",
"    # Read the committed rows while the session is still open: if the session\n",
"    # expires objects on commit, touching them after it closes would fail.\n",
"    links_view = [(l.molecule_id, l.dataset_id) for l in links]\n",
"links_view\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Update a descriptor (refined QED)\n",
"Typical pattern: load → modify → commit → refresh."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"\n",
"with get_session() as s:\n",
" mol = s.exec(select(Molecule).where(Molecule.smiles=='CCO')).one()\n",
" mol.qed = 0.50\n",
" mol.updated_at = datetime.utcnow()\n",
" s.add(mol); s.commit(); s.refresh(mol)\n",
"(mol.id, mol.smiles, mol.qed)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Filtering and ordering\n",
"Examples: threshold on `qed` and ordering by `sa_score`; prefix search on smiles."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with get_session() as s:\n",
" hi_qed = s.exec(select(Molecule).where(Molecule.qed>=0.6).order_by(Molecule.sa_score.asc())).all()\n",
" hi_qed_view = [(m.smiles, m.qed, m.sa_score) for m in hi_qed]\n",
"\n",
"with get_session() as s:\n",
" starts_with_cc = s.exec(select(Molecule).where(Molecule.smiles.like('CC%'))).all()\n",
" starts_with_cc_view = [m.smiles for m in starts_with_cc]\n",
"\n",
"hi_qed_view, starts_with_cc_view\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8. Eager loading relationships (avoid N+1)\n",
"Read molecules with their datasets efficiently using `selectinload`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with get_session() as s:\n",
" stmt = select(Molecule).options(selectinload(Molecule.datasets)).order_by(Molecule.id.asc())\n",
" molecules = s.exec(stmt).all()\n",
"[(m.smiles, [d.name for d in m.datasets]) for m in molecules]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9. Join filtering\n",
"Return only molecules that belong to the `train` dataset."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with get_session() as s:\n",
" stmt = (select(Molecule)\n",
" .join(MoleculeDataset, Molecule.id==MoleculeDataset.molecule_id)\n",
" .join(Dataset, Dataset.id==MoleculeDataset.dataset_id)\n",
" .where(Dataset.name=='train')\n",
" .order_by(Molecule.id.asc()))\n",
" train_mols = s.exec(stmt).all()\n",
"[m.smiles for m in train_mols]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 10. Delete a molecule\n",
"Load → delete → commit; verify remaining molecules."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with get_session() as s:\n",
" target = s.exec(select(Molecule).where(Molecule.smiles=='CCN(CC)CC')).one()\n",
" s.delete(target); s.commit()\n",
" left = s.exec(select(Molecule).order_by(Molecule.id.asc())).all()\n",
"[m.smiles for m in left]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 11. Optional: RDKit + Mordred computation\n",
"If installed, compute descriptors and update a molecule (e.g., refine QED)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The chem stack is optional: record whether it imported and skip the demo otherwise.\n",
"try:\n",
"    from rdkit import Chem\n",
"    from rdkit.Chem import QED\n",
"    from mordred import Calculator, descriptors\n",
"    ok = True\n",
"except Exception as e:\n",
"    ok = False\n",
"    print('RDKit/Mordred not available; skipping.\\n', e)\n",
"\n",
"if ok:\n",
"    # Named rd_mol so it does not shadow the ORM `mol` object from the update cell.\n",
"    rd_mol = Chem.MolFromSmiles('c1ccccc1O')\n",
"    qed_val = float(QED.qed(rd_mol))\n",
"    calc = Calculator(descriptors, ignore_3D=True)\n",
"    md = calc(rd_mol)\n",
"    num_desc = sum(1 for _ in md.items())\n",
"    print('Computed QED:', qed_val, 'Mordred descriptors:', num_desc)\n",
"    # Upsert: update the existing row for this SMILES, or insert a new one.\n",
"    with get_session() as s:\n",
"        m = s.exec(select(Molecule).where(Molecule.smiles=='c1ccccc1O')).first()\n",
"        if m is None:\n",
"            m = Molecule(smiles='c1ccccc1O', qed=qed_val)\n",
"        else:\n",
"            m.qed = qed_val\n",
"        s.add(m)\n",
"        s.commit()\n",
"        s.refresh(m)\n",
"        # A bare expression nested this deep is never displayed by Jupyter, so print it.\n",
"        print('Stored:', (m.id, m.smiles, m.qed))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,172 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 06 — CSV → SQLModel → Table\n\n",
"Auto-generate a SQLModel from a CSV header, import rows, and query.\n",
"Demonstrates SQLite/Postgres switching and simple filtering."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 0. Install (optional) and choose backend\n",
"- Use SQLite in-memory or file by overriding `db.engine`\n",
"- Or rely on Postgres via `SQL_*`/`PG*` env vars"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# %pip install -e . pytest\n",
"from sqlmodel_pg_kit import db\n",
"# Uncomment ONE of the following to use SQLite:\n",
"# db.engine = db.create_engine('sqlite:///:memory:', echo=False)\n",
"# db.engine = db.create_engine('sqlite:///./demo.db', echo=False)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Build model from CSV and insert rows\n",
"We use the sample CSV at `data/molecules_sample.csv`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Parameters\n",
"Set CSV path, optional class/table names, null sentinels, type overrides, and column renames."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# CSV path and optional names\n",
"csv_path = 'data/molecules_sample.csv'\n",
"class_name_override = None # e.g., 'Molecules'\n",
"table_name_override = None # e.g., 'molecules'\n",
"\n",
"# Null sentinels (in addition to default: '', na, nan, none, null)\n",
"custom_nulls = ['N/A']\n",
"\n",
"# Type overrides: name -> type (bool/int/float/str)\n",
"type_overrides = { # e.g., 'count': int, 'qed': float\n",
" # 'count': int,\n",
"} \n",
"\n",
"# Rename mappings: original header -> new name before sanitization\n",
"rename_map = { # e.g., 'sa': 'sa_score'\n",
" # 'sa': 'sa_score',\n",
"}\n",
"\n",
"# Columns to index after import (B-Tree)\n",
"index_columns = ['qed', 'sa_score']\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sqlmodel_pg_kit.csv_import import build_model_from_csv, insert_rows, create_indexes\n",
"from sqlmodel_pg_kit import create_all\n",
"from sqlmodel_pg_kit.db import get_session\n",
"\n",
"spec, rows = build_model_from_csv(\n",
" csv_path,\n",
" class_name=class_name_override,\n",
" table_name=table_name_override,\n",
" null_values=custom_nulls,\n",
" type_overrides=type_overrides,\n",
" rename_map=rename_map,\n",
" warn_on_nulls=True,\n",
")\n",
"spec.model, spec.table_name, spec.columns\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create table and insert\n",
"create_all()\n",
"with get_session() as s:\n",
" n = insert_rows(spec.model, rows, s)\n",
"n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optionally create B-Tree indexes on selected columns\n",
"from sqlmodel_pg_kit import db\n",
"created = create_indexes(spec.model, index_columns, db.engine)\n",
"created\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Query a few rows and filters\n",
"Use SQLModel/SQLAlchemy expressions to filter by inferred columns."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sqlmodel import select\n",
"\n",
"with get_session() as s:\n",
" all_rows = s.exec(select(spec.model).order_by(spec.model.id.asc())).all()\n",
"[(r.id, r.smiles, r.qed, r.sa_score, r.active) for r in all_rows]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Filter: high QED and active\n",
"with get_session() as s:\n",
" hi = s.exec(select(spec.model).where((spec.model.qed>=0.6) & (spec.model.active==True))).all()\n",
"[(r.smiles, r.qed, r.active) for r in hi]\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}