Compare commits
5 Commits
360cbc487e
...
f6bf9e85a3
| Author | SHA1 | Date | |
|---|---|---|---|
| | f6bf9e85a3 | | |
| | 46a438dd36 | | |
| | 07ba27be2b | | |
| | f43f0520ce | | |
| | bb42044faf | | |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -68,3 +68,4 @@ data/
|
|||||||
output/
|
output/
|
||||||
site/
|
site/
|
||||||
# docs/ source files should be tracked, only ignore generated site/
|
# docs/ source files should be tracked, only ignore generated site/
|
||||||
|
validation_output/
|
||||||
|
|||||||
90
README.md
90
README.md
@@ -62,6 +62,96 @@ pixi run macro-lactone-toolkit fragment \
|
|||||||
|
|
||||||
默认读取 `smiles` 列;若存在 `id` 列则将其作为 `parent_id`,否则自动生成 `row_<index>`。
|
默认读取 `smiles` 列;若存在 `id` 列则将其作为 `parent_id`,否则自动生成 `row_<index>`。
|
||||||
|
|
||||||
|
## MacrolactoneDB 验证模块
|
||||||
|
|
||||||
|
用于对 MacrolactoneDB 数据库进行抽样验证、分类、侧链断裂和数据库存储。
|
||||||
|
|
||||||
|
### 验证脚本使用
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 基本使用(10% 分层抽样)
|
||||||
|
pixi run python scripts/validate_macrolactone_db.py \
|
||||||
|
--input data/MacrolactoneDB/ring12_20/temp.csv \
|
||||||
|
--output validation_output \
|
||||||
|
--sample-ratio 0.1
|
||||||
|
|
||||||
|
# 处理全量数据
|
||||||
|
pixi run python scripts/validate_macrolactone_db.py \
|
||||||
|
--input data/MacrolactoneDB/ring12_20/temp.csv \
|
||||||
|
--output validation_output \
|
||||||
|
--sample-ratio 1.0
|
||||||
|
|
||||||
|
# 指定列名(如果 CSV 列名不同)
|
||||||
|
pixi run python scripts/validate_macrolactone_db.py \
|
||||||
|
--input data.csv \
|
||||||
|
--output validation_output \
|
||||||
|
--id-col ml_id \
|
||||||
|
--chembl-id-col IDs \
|
||||||
|
--smiles-col smiles
|
||||||
|
```
|
||||||
|
|
||||||
|
### 输出结构
|
||||||
|
|
||||||
|
```
|
||||||
|
validation_output/
|
||||||
|
├── README.md # 目录说明
|
||||||
|
├── fragments.db # SQLite 数据库
|
||||||
|
├── fragment_library.csv # 最终片段库导出(含 has_dummy_atom / splice_ready)
|
||||||
|
├── summary.csv # 汇总表(含 ml_id, chembl_id)
|
||||||
|
├── summary_statistics.json # 统计信息
|
||||||
|
├── ring_size_12/ # 按环大小组织
|
||||||
|
├── ring_size_13/
|
||||||
|
...
|
||||||
|
└── ring_size_20/
|
||||||
|
├── standard/
|
||||||
|
│ ├── numbered/ # 带编号的环图(文件名使用 ml_id)
|
||||||
|
│ │ └── {ml_id}_numbered.png
|
||||||
|
│ └── sidechains/ # 片段图
|
||||||
|
│ └── {ml_id}/
|
||||||
|
│ └── {ml_id}_frag_{n}_pos{pos}.png
|
||||||
|
├── non_standard/original/
|
||||||
|
└── rejected/original/
|
||||||
|
```
|
||||||
|
|
||||||
|
### 数据库查询示例
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 查看表结构
|
||||||
|
sqlite3 validation_output/fragments.db ".tables"
|
||||||
|
|
||||||
|
# 查询标准大环内酯
|
||||||
|
sqlite3 validation_output/fragments.db \
|
||||||
|
"SELECT ml_id, chembl_id, ring_size, num_sidechains \
|
||||||
|
FROM parent_molecules \
|
||||||
|
WHERE classification='standard_macrolactone' LIMIT 5;"
|
||||||
|
|
||||||
|
# 查询最终片段库
|
||||||
|
sqlite3 validation_output/fragments.db \
|
||||||
|
"SELECT source_type, source_parent_ml_id, cleavage_position, has_dummy_atom, splice_ready \
|
||||||
|
FROM fragment_library_entries LIMIT 10;"
|
||||||
|
|
||||||
|
# 查询片段
|
||||||
|
sqlite3 validation_output/fragments.db \
|
||||||
|
"SELECT fragment_id, cleavage_position, dummy_isotope, has_dummy_atom, dummy_atom_count \
|
||||||
|
FROM side_chain_fragments LIMIT 10;"
|
||||||
|
|
||||||
|
# 按环大小统计
|
||||||
|
sqlite3 validation_output/fragments.db \
|
||||||
|
"SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY ring_size;"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 关键字段说明
|
||||||
|
|
||||||
|
| 字段 | 说明 |
|
||||||
|
|------|------|
|
||||||
|
| `ml_id` | MacrolactoneDB 唯一 ID(如 ML00000001),用于文件命名 |
|
||||||
|
| `chembl_id` | 原始 CHEMBL ID(如 CHEMBL94657),可能为空 |
|
||||||
|
| `classification` | standard_macrolactone / non_standard_macrocycle / not_macrolactone |
|
||||||
|
| `dummy_isotope` | 裂解位置编号,用于片段重建 |
|
||||||
|
| `cleavage_position` | 环上的断裂位置 |
|
||||||
|
| `has_dummy_atom` | 该片段是否带 dummy 原子,可用于区分可直接拼接片段 |
|
||||||
|
| `splice_ready` | 是否与当前单锚点拼接流程直接兼容 |
|
||||||
|
|
||||||
## Legacy Scripts
|
## Legacy Scripts
|
||||||
|
|
||||||
`scripts/` 目录保留为薄封装或迁移提示,不再承载核心实现。正式接口以 `macro_lactone_toolkit.*` 与 `macro-lactone-toolkit` CLI 为准。
|
`scripts/` 目录保留为薄封装或迁移提示,不再承载核心实现。正式接口以 `macro_lactone_toolkit.*` 与 `macro-lactone-toolkit` CLI 为准。
|
||||||
|
|||||||
@@ -50,8 +50,14 @@ def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--id-col",
|
"--id-col",
|
||||||
type=str,
|
type=str,
|
||||||
|
default="ml_id",
|
||||||
|
help="ID column name (default: ml_id)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--chembl-id-col",
|
||||||
|
type=str,
|
||||||
default="IDs",
|
default="IDs",
|
||||||
help="ID column name",
|
help="CHEMBL ID column name (default: IDs)",
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@@ -69,6 +75,7 @@ def main():
|
|||||||
sample_ratio=args.sample_ratio,
|
sample_ratio=args.sample_ratio,
|
||||||
smiles_col=args.smiles_col,
|
smiles_col=args.smiles_col,
|
||||||
id_col=args.id_col,
|
id_col=args.id_col,
|
||||||
|
chembl_id_col=args.chembl_id_col,
|
||||||
)
|
)
|
||||||
|
|
||||||
results = validator.run(args.input)
|
results = validator.run(args.input)
|
||||||
|
|||||||
@@ -272,6 +272,44 @@ def collect_side_chain_atoms(
|
|||||||
return side_chain_atoms
|
return side_chain_atoms
|
||||||
|
|
||||||
|
|
||||||
|
def find_side_chain_ring_connections(
    mol: Chem.Mol,
    side_chain_atoms: Iterable[int],
    ring_atom_indices: Iterable[int],
) -> list[tuple[int, int]]:
    """Return every bond that joins the given side-chain atoms to the ring.

    Args:
        mol: Molecule whose atoms are looked up by index.
        side_chain_atoms: Indices of the atoms whose neighbours are inspected.
        ring_atom_indices: Indices of the atoms that count as ring members.

    Returns:
        De-duplicated ``(side_chain_atom_idx, ring_atom_idx)`` pairs, ordered
        by ring atom index first and side-chain atom index second.
    """
    ring_members = set(ring_atom_indices)
    pairs = {
        (chain_idx, adjacent.GetIdx())
        for chain_idx in side_chain_atoms
        for adjacent in mol.GetAtomWithIdx(chain_idx).GetNeighbors()
        if adjacent.GetIdx() in ring_members
    }
    # Reversing each pair yields the (ring_idx, chain_idx) sort key.
    return sorted(pairs, key=lambda pair: pair[::-1])
|
||||||
|
|
||||||
|
|
||||||
|
def collect_fragmentable_side_chain_atoms(
    mol: Chem.Mol,
    start_atom_idx: int,
    ring_atom_indices: Iterable[int],
    ring_atom_idx: int | None = None,
) -> list[int] | None:
    """Collect a side chain only if it touches the ring through a single bond.

    Args:
        mol: Molecule being analysed.
        start_atom_idx: Atom from which ``collect_side_chain_atoms`` starts.
        ring_atom_indices: Indices of the ring atoms.
        ring_atom_idx: When given, the ring atom that the single connection
            must land on.

    Returns:
        The atoms reported by ``collect_side_chain_atoms``, or ``None`` when
        that result is empty, when the atoms have zero or several bonds to the
        ring, or when the only bond ends on a ring atom other than
        ``ring_atom_idx``.
    """
    branch_atoms = collect_side_chain_atoms(mol, start_atom_idx, ring_atom_indices)
    if not branch_atoms:
        return None

    anchors = find_side_chain_ring_connections(mol, branch_atoms, ring_atom_indices)
    single_anchor = len(anchors) == 1
    if not single_anchor:
        return None

    # Each anchor is (side_chain_atom_idx, ring_atom_idx); compare the ring end.
    anchored_elsewhere = ring_atom_idx is not None and anchors[0][1] != ring_atom_idx
    return None if anchored_elsewhere else branch_atoms
|
||||||
|
|
||||||
|
|
||||||
def is_intrinsic_lactone_neighbor(
|
def is_intrinsic_lactone_neighbor(
|
||||||
mol: Chem.Mol,
|
mol: Chem.Mol,
|
||||||
candidate: DetectedMacrolactone,
|
candidate: DetectedMacrolactone,
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ from rdkit.Chem import Descriptors
|
|||||||
from ._core import (
|
from ._core import (
|
||||||
build_fragment_smiles,
|
build_fragment_smiles,
|
||||||
build_numbering_result,
|
build_numbering_result,
|
||||||
collect_side_chain_atoms,
|
collect_fragmentable_side_chain_atoms,
|
||||||
ensure_mol,
|
ensure_mol,
|
||||||
find_macrolactone_candidates,
|
find_macrolactone_candidates,
|
||||||
is_intrinsic_lactone_neighbor,
|
is_intrinsic_lactone_neighbor,
|
||||||
@@ -44,6 +44,8 @@ class MacrolactoneFragmenter:
|
|||||||
fragments: list[SideChainFragment] = []
|
fragments: list[SideChainFragment] = []
|
||||||
|
|
||||||
for position, ring_atom_idx in numbering.position_to_atom.items():
|
for position, ring_atom_idx in numbering.position_to_atom.items():
|
||||||
|
if int(position) <= 2:
|
||||||
|
continue
|
||||||
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
||||||
for neighbor in ring_atom.GetNeighbors():
|
for neighbor in ring_atom.GetNeighbors():
|
||||||
neighbor_idx = neighbor.GetIdx()
|
neighbor_idx = neighbor.GetIdx()
|
||||||
@@ -52,8 +54,13 @@ class MacrolactoneFragmenter:
|
|||||||
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
|
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
|
side_chain_atoms = collect_fragmentable_side_chain_atoms(
|
||||||
if not side_chain_atoms:
|
mol=mol,
|
||||||
|
start_atom_idx=neighbor_idx,
|
||||||
|
ring_atom_indices=ring_atom_set,
|
||||||
|
ring_atom_idx=ring_atom_idx,
|
||||||
|
)
|
||||||
|
if side_chain_atoms is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from typing import Iterable
|
|||||||
|
|
||||||
from rdkit import Chem
|
from rdkit import Chem
|
||||||
|
|
||||||
from .._core import collect_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor
|
from .._core import collect_fragmentable_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor
|
||||||
from ..fragmenter import MacrolactoneFragmenter
|
from ..fragmenter import MacrolactoneFragmenter
|
||||||
|
|
||||||
|
|
||||||
@@ -26,22 +26,36 @@ def prepare_macrolactone_scaffold(
|
|||||||
for position in positions:
|
for position in positions:
|
||||||
if position not in numbering.position_to_atom:
|
if position not in numbering.position_to_atom:
|
||||||
raise ValueError(f"Position {position} not found in ring numbering.")
|
raise ValueError(f"Position {position} not found in ring numbering.")
|
||||||
|
if position <= 2:
|
||||||
|
raise ValueError(f"Position {position} does not contain a single-anchor fragmentable side chain")
|
||||||
ring_atom_idx = numbering.position_to_atom[position]
|
ring_atom_idx = numbering.position_to_atom[position]
|
||||||
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
||||||
|
position_dummy_specs: list[tuple[int, int, Chem.BondType]] = []
|
||||||
|
|
||||||
for neighbor in ring_atom.GetNeighbors():
|
for neighbor in ring_atom.GetNeighbors():
|
||||||
neighbor_idx = neighbor.GetIdx()
|
neighbor_idx = neighbor.GetIdx()
|
||||||
if neighbor_idx in ring_atom_set:
|
if neighbor_idx in ring_atom_set:
|
||||||
continue
|
continue
|
||||||
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
|
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
|
||||||
continue
|
continue
|
||||||
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
|
side_chain_atoms = collect_fragmentable_side_chain_atoms(
|
||||||
|
mol=mol,
|
||||||
|
start_atom_idx=neighbor_idx,
|
||||||
|
ring_atom_indices=ring_atom_set,
|
||||||
|
ring_atom_idx=ring_atom_idx,
|
||||||
|
)
|
||||||
|
if side_chain_atoms is None:
|
||||||
|
continue
|
||||||
atoms_to_remove.update(side_chain_atoms)
|
atoms_to_remove.update(side_chain_atoms)
|
||||||
bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx)
|
bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx)
|
||||||
if bond is not None:
|
if bond is not None:
|
||||||
dummy_specs.append((ring_atom_idx, position, bond.GetBondType()))
|
position_dummy_specs.append((ring_atom_idx, position, bond.GetBondType()))
|
||||||
|
|
||||||
if not any(spec_position == position for _, spec_position, _ in dummy_specs):
|
if not position_dummy_specs:
|
||||||
dummy_specs.append((ring_atom_idx, position, Chem.BondType.SINGLE))
|
raise ValueError(f"Position {position} does not contain a single-anchor fragmentable side chain")
|
||||||
|
if len(position_dummy_specs) > 1:
|
||||||
|
raise ValueError(f"Position {position} contains multiple fragmentable side chains")
|
||||||
|
dummy_specs.extend(position_dummy_specs)
|
||||||
|
|
||||||
rwmol = Chem.RWMol(mol)
|
rwmol = Chem.RWMol(mol)
|
||||||
for ring_atom_idx, position, bond_type in dummy_specs:
|
for ring_atom_idx, position, bond_type in dummy_specs:
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import UTC, datetime
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
@@ -27,7 +27,8 @@ class ParentMolecule(SQLModel, table=True):
|
|||||||
__tablename__ = "parent_molecules"
|
__tablename__ = "parent_molecules"
|
||||||
|
|
||||||
id: Optional[int] = Field(default=None, primary_key=True)
|
id: Optional[int] = Field(default=None, primary_key=True)
|
||||||
source_id: str = Field(index=True)
|
ml_id: str = Field(index=True) # MacrolactoneDB unique ID (e.g., ML00000001)
|
||||||
|
chembl_id: Optional[str] = Field(default=None, index=True) # Original CHEMBL ID
|
||||||
molecule_name: Optional[str] = None
|
molecule_name: Optional[str] = None
|
||||||
smiles: str = Field(index=True)
|
smiles: str = Field(index=True)
|
||||||
classification: str = Field(index=True)
|
classification: str = Field(index=True)
|
||||||
@@ -39,7 +40,7 @@ class ParentMolecule(SQLModel, table=True):
|
|||||||
num_sidechains: Optional[int] = None
|
num_sidechains: Optional[int] = None
|
||||||
cleavage_positions: Optional[str] = None
|
cleavage_positions: Optional[str] = None
|
||||||
numbered_image_path: Optional[str] = None
|
numbered_image_path: Optional[str] = None
|
||||||
created_at: datetime = Field(default_factory=datetime.utcnow)
|
created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
||||||
processed_at: Optional[datetime] = None
|
processed_at: Optional[datetime] = None
|
||||||
|
|
||||||
|
|
||||||
@@ -71,6 +72,8 @@ class SideChainFragment(SQLModel, table=True):
|
|||||||
fragment_smiles_labeled: str
|
fragment_smiles_labeled: str
|
||||||
fragment_smiles_plain: str
|
fragment_smiles_plain: str
|
||||||
dummy_isotope: int
|
dummy_isotope: int
|
||||||
|
has_dummy_atom: bool = Field(default=True)
|
||||||
|
dummy_atom_count: int = Field(default=1)
|
||||||
atom_count: int
|
atom_count: int
|
||||||
heavy_atom_count: int
|
heavy_atom_count: int
|
||||||
molecular_weight: float
|
molecular_weight: float
|
||||||
@@ -78,6 +81,26 @@ class SideChainFragment(SQLModel, table=True):
|
|||||||
image_path: Optional[str] = None
|
image_path: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class FragmentLibraryEntry(SQLModel, table=True):
    """Unified fragment library entries."""

    __tablename__ = "fragment_library_entries"

    # Primary key.
    id: Optional[int] = Field(default=None, primary_key=True)

    # Provenance: where the fragment came from and which parent it was cut from.
    source_type: str = Field(index=True)
    source_fragment_id: Optional[str] = Field(default=None, index=True)
    source_parent_ml_id: Optional[str] = Field(default=None, index=True)
    source_parent_chembl_id: Optional[str] = Field(default=None, index=True)

    # Ring position of the cleavage and the fragment structure itself.
    cleavage_position: Optional[int] = Field(default=None, index=True)
    fragment_smiles_labeled: Optional[str] = None
    fragment_smiles_plain: str

    # Dummy-atom bookkeeping and compatibility with single-anchor splicing.
    has_dummy_atom: bool = Field(default=False)
    dummy_atom_count: int = Field(default=0)
    splice_ready: bool = Field(default=False, index=True)
    original_bond_type: Optional[str] = None

    # Timezone-aware UTC creation timestamp.
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
||||||
|
|
||||||
|
|
||||||
class ValidationResult(SQLModel, table=True):
|
class ValidationResult(SQLModel, table=True):
|
||||||
"""Manual validation records."""
|
"""Manual validation records."""
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from datetime import datetime
|
from datetime import UTC, datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -12,7 +12,7 @@ from sqlmodel import select
|
|||||||
from macro_lactone_toolkit import MacroLactoneAnalyzer
|
from macro_lactone_toolkit import MacroLactoneAnalyzer
|
||||||
from macro_lactone_toolkit._core import (
|
from macro_lactone_toolkit._core import (
|
||||||
build_numbering_result,
|
build_numbering_result,
|
||||||
collect_side_chain_atoms,
|
collect_fragmentable_side_chain_atoms,
|
||||||
find_macrolactone_candidates,
|
find_macrolactone_candidates,
|
||||||
is_intrinsic_lactone_neighbor,
|
is_intrinsic_lactone_neighbor,
|
||||||
)
|
)
|
||||||
@@ -20,6 +20,7 @@ from macro_lactone_toolkit.validation.database import get_engine, get_session, i
|
|||||||
from macro_lactone_toolkit.validation.isotope_utils import build_fragment_with_isotope
|
from macro_lactone_toolkit.validation.isotope_utils import build_fragment_with_isotope
|
||||||
from macro_lactone_toolkit.validation.models import (
|
from macro_lactone_toolkit.validation.models import (
|
||||||
ClassificationType,
|
ClassificationType,
|
||||||
|
FragmentLibraryEntry,
|
||||||
ParentMolecule,
|
ParentMolecule,
|
||||||
ProcessingStatus,
|
ProcessingStatus,
|
||||||
RingNumbering,
|
RingNumbering,
|
||||||
@@ -41,12 +42,14 @@ class MacrolactoneValidator:
|
|||||||
output_dir: str | Path,
|
output_dir: str | Path,
|
||||||
sample_ratio: float = 0.1,
|
sample_ratio: float = 0.1,
|
||||||
smiles_col: str = "smiles",
|
smiles_col: str = "smiles",
|
||||||
id_col: str = "IDs",
|
id_col: str = "ml_id",
|
||||||
|
chembl_id_col: str = "IDs",
|
||||||
):
|
):
|
||||||
self.output_dir = Path(output_dir)
|
self.output_dir = Path(output_dir)
|
||||||
self.sample_ratio = sample_ratio
|
self.sample_ratio = sample_ratio
|
||||||
self.smiles_col = smiles_col
|
self.smiles_col = smiles_col
|
||||||
self.id_col = id_col
|
self.id_col = id_col
|
||||||
|
self.chembl_id_col = chembl_id_col
|
||||||
|
|
||||||
self.analyzer = MacroLactoneAnalyzer()
|
self.analyzer = MacroLactoneAnalyzer()
|
||||||
|
|
||||||
@@ -78,12 +81,14 @@ class MacrolactoneValidator:
|
|||||||
# Generate outputs
|
# Generate outputs
|
||||||
self._generate_readme()
|
self._generate_readme()
|
||||||
self._generate_summary()
|
self._generate_summary()
|
||||||
|
self._generate_fragment_library()
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def _process_molecule(self, row: pd.Series) -> str:
|
def _process_molecule(self, row: pd.Series) -> str:
|
||||||
"""Process a single molecule. Returns status."""
|
"""Process a single molecule. Returns status."""
|
||||||
source_id = str(row[self.id_col])
|
ml_id = str(row[self.id_col])
|
||||||
|
chembl_id = str(row[self.chembl_id_col]) if self.chembl_id_col in row and pd.notna(row[self.chembl_id_col]) else None
|
||||||
smiles = row[self.smiles_col]
|
smiles = row[self.smiles_col]
|
||||||
name = row.get("molecule_pref_name", None)
|
name = row.get("molecule_pref_name", None)
|
||||||
|
|
||||||
@@ -105,7 +110,8 @@ class MacrolactoneValidator:
|
|||||||
|
|
||||||
# Create parent record
|
# Create parent record
|
||||||
parent = ParentMolecule(
|
parent = ParentMolecule(
|
||||||
source_id=source_id,
|
ml_id=ml_id,
|
||||||
|
chembl_id=chembl_id,
|
||||||
molecule_name=name,
|
molecule_name=name,
|
||||||
smiles=smiles,
|
smiles=smiles,
|
||||||
classification=classification,
|
classification=classification,
|
||||||
@@ -124,7 +130,7 @@ class MacrolactoneValidator:
|
|||||||
parent.processing_status = ProcessingStatus.SKIPPED
|
parent.processing_status = ProcessingStatus.SKIPPED
|
||||||
session.add(parent)
|
session.add(parent)
|
||||||
session.commit()
|
session.commit()
|
||||||
self._save_original_image(smiles, source_id, ring_size, classification)
|
self._save_original_image(smiles, ml_id, ring_size, classification)
|
||||||
return "skipped"
|
return "skipped"
|
||||||
|
|
||||||
# Process standard macrolactone
|
# Process standard macrolactone
|
||||||
@@ -134,7 +140,7 @@ class MacrolactoneValidator:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
parent.processing_status = ProcessingStatus.FAILED
|
parent.processing_status = ProcessingStatus.FAILED
|
||||||
parent.error_message = str(e)
|
parent.error_message = str(e)
|
||||||
parent.processed_at = datetime.utcnow()
|
parent.processed_at = datetime.now(UTC)
|
||||||
session.add(parent)
|
session.add(parent)
|
||||||
session.commit()
|
session.commit()
|
||||||
return "failed"
|
return "failed"
|
||||||
@@ -172,7 +178,7 @@ class MacrolactoneValidator:
|
|||||||
|
|
||||||
# Save numbered image
|
# Save numbered image
|
||||||
paths = get_output_paths(
|
paths = get_output_paths(
|
||||||
self.output_dir, parent.source_id, parent.ring_size, "standard_macrolactone"
|
self.output_dir, parent.ml_id, parent.ring_size, "standard_macrolactone"
|
||||||
)
|
)
|
||||||
image_path = save_numbered_molecule(smiles, paths["numbered_image"], parent.ring_size)
|
image_path = save_numbered_molecule(smiles, paths["numbered_image"], parent.ring_size)
|
||||||
if image_path:
|
if image_path:
|
||||||
@@ -184,6 +190,8 @@ class MacrolactoneValidator:
|
|||||||
fragment_idx = 0
|
fragment_idx = 0
|
||||||
|
|
||||||
for position, ring_atom_idx in numbering.position_to_atom.items():
|
for position, ring_atom_idx in numbering.position_to_atom.items():
|
||||||
|
if int(position) <= 2:
|
||||||
|
continue
|
||||||
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
||||||
|
|
||||||
for neighbor in ring_atom.GetNeighbors():
|
for neighbor in ring_atom.GetNeighbors():
|
||||||
@@ -196,8 +204,13 @@ class MacrolactoneValidator:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Collect side chain atoms
|
# Collect side chain atoms
|
||||||
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
|
side_chain_atoms = collect_fragmentable_side_chain_atoms(
|
||||||
if not side_chain_atoms:
|
mol=mol,
|
||||||
|
start_atom_idx=neighbor_idx,
|
||||||
|
ring_atom_indices=ring_atom_set,
|
||||||
|
ring_atom_idx=ring_atom_idx,
|
||||||
|
)
|
||||||
|
if side_chain_atoms is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Build fragment with isotope tagging
|
# Build fragment with isotope tagging
|
||||||
@@ -217,13 +230,15 @@ class MacrolactoneValidator:
|
|||||||
# Create fragment record
|
# Create fragment record
|
||||||
fragment = SideChainFragment(
|
fragment = SideChainFragment(
|
||||||
parent_id=parent.id,
|
parent_id=parent.id,
|
||||||
fragment_id=f"{parent.source_id}_frag_{fragment_idx}",
|
fragment_id=f"{parent.ml_id}_frag_{fragment_idx}",
|
||||||
cleavage_position=int(position),
|
cleavage_position=int(position),
|
||||||
attachment_atom_idx=ring_atom_idx,
|
attachment_atom_idx=ring_atom_idx,
|
||||||
attachment_atom_symbol=ring_atom.GetSymbol(),
|
attachment_atom_symbol=ring_atom.GetSymbol(),
|
||||||
fragment_smiles_labeled=labeled_smiles,
|
fragment_smiles_labeled=labeled_smiles,
|
||||||
fragment_smiles_plain=plain_smiles,
|
fragment_smiles_plain=plain_smiles,
|
||||||
dummy_isotope=int(position),
|
dummy_isotope=int(position),
|
||||||
|
has_dummy_atom=True,
|
||||||
|
dummy_atom_count=1,
|
||||||
atom_count=atom_count,
|
atom_count=atom_count,
|
||||||
heavy_atom_count=heavy_atom_count,
|
heavy_atom_count=heavy_atom_count,
|
||||||
molecular_weight=round(mw, 4),
|
molecular_weight=round(mw, 4),
|
||||||
@@ -231,11 +246,26 @@ class MacrolactoneValidator:
|
|||||||
)
|
)
|
||||||
session.add(fragment)
|
session.add(fragment)
|
||||||
fragments.append(fragment)
|
fragments.append(fragment)
|
||||||
|
session.add(
|
||||||
|
FragmentLibraryEntry(
|
||||||
|
source_type="validation_extract",
|
||||||
|
source_fragment_id=fragment.fragment_id,
|
||||||
|
source_parent_ml_id=parent.ml_id,
|
||||||
|
source_parent_chembl_id=parent.chembl_id,
|
||||||
|
cleavage_position=int(position),
|
||||||
|
fragment_smiles_labeled=labeled_smiles,
|
||||||
|
fragment_smiles_plain=plain_smiles,
|
||||||
|
has_dummy_atom=True,
|
||||||
|
dummy_atom_count=1,
|
||||||
|
splice_ready=True,
|
||||||
|
original_bond_type=bond_type,
|
||||||
|
)
|
||||||
|
)
|
||||||
fragment_idx += 1
|
fragment_idx += 1
|
||||||
|
|
||||||
# Save fragment images
|
# Save fragment images
|
||||||
if fragments and paths["sidechains_dir"]:
|
if fragments and paths["sidechains_dir"]:
|
||||||
image_paths = save_fragment_images(fragments, paths["sidechains_dir"], parent.source_id)
|
image_paths = save_fragment_images(fragments, paths["sidechains_dir"], parent.ml_id)
|
||||||
for frag, img_path in zip(fragments, image_paths):
|
for frag, img_path in zip(fragments, image_paths):
|
||||||
frag.image_path = img_path
|
frag.image_path = img_path
|
||||||
session.add(frag)
|
session.add(frag)
|
||||||
@@ -244,13 +274,13 @@ class MacrolactoneValidator:
|
|||||||
parent.processing_status = ProcessingStatus.SUCCESS
|
parent.processing_status = ProcessingStatus.SUCCESS
|
||||||
parent.num_sidechains = len(fragments)
|
parent.num_sidechains = len(fragments)
|
||||||
parent.cleavage_positions = json.dumps([f.cleavage_position for f in fragments])
|
parent.cleavage_positions = json.dumps([f.cleavage_position for f in fragments])
|
||||||
parent.processed_at = datetime.utcnow()
|
parent.processed_at = datetime.now(UTC)
|
||||||
session.add(parent)
|
session.add(parent)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
def _save_original_image(self, smiles: str, source_id: str, ring_size: int, classification: str):
|
def _save_original_image(self, smiles: str, ml_id: str, ring_size: int, classification: str):
|
||||||
"""Save original image for non-standard molecules."""
|
"""Save original image for non-standard molecules."""
|
||||||
paths = get_output_paths(self.output_dir, source_id, ring_size, classification)
|
paths = get_output_paths(self.output_dir, ml_id, ring_size, classification)
|
||||||
try:
|
try:
|
||||||
from rdkit.Chem import Draw
|
from rdkit.Chem import Draw
|
||||||
|
|
||||||
@@ -272,6 +302,7 @@ This directory contains validation results for MacrolactoneDB 12-20 membered rin
|
|||||||
validation_output/
|
validation_output/
|
||||||
├── README.md # This file
|
├── README.md # This file
|
||||||
├── fragments.db # SQLite database with all data
|
├── fragments.db # SQLite database with all data
|
||||||
|
├── fragment_library.csv # Unified fragment library export
|
||||||
├── summary.csv # Summary of all processed molecules
|
├── summary.csv # Summary of all processed molecules
|
||||||
├── summary_statistics.json # Statistical summary
|
├── summary_statistics.json # Statistical summary
|
||||||
│
|
│
|
||||||
@@ -301,6 +332,7 @@ validation_output/
|
|||||||
- **parent_molecules**: Original molecule information
|
- **parent_molecules**: Original molecule information
|
||||||
- **ring_numberings**: Ring atom numbering details
|
- **ring_numberings**: Ring atom numbering details
|
||||||
- **side_chain_fragments**: Fragmentation results with isotope tags
|
- **side_chain_fragments**: Fragmentation results with isotope tags
|
||||||
|
- **fragment_library_entries**: Unified fragment library rows for downstream design
|
||||||
- **validation_results**: Manual validation records
|
- **validation_results**: Manual validation records
|
||||||
|
|
||||||
### Key Fields
|
### Key Fields
|
||||||
@@ -308,6 +340,8 @@ validation_output/
|
|||||||
- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone
|
- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone
|
||||||
- `dummy_isotope`: Cleavage position stored as isotope value for reconstruction
|
- `dummy_isotope`: Cleavage position stored as isotope value for reconstruction
|
||||||
- `cleavage_position`: Position on ring where side chain was attached
|
- `cleavage_position`: Position on ring where side chain was attached
|
||||||
|
- `has_dummy_atom`: Whether the fragment contains a dummy atom for splicing
|
||||||
|
- `dummy_atom_count`: Number of dummy atoms in the fragment
|
||||||
|
|
||||||
## Ring Numbering Convention
|
## Ring Numbering Convention
|
||||||
|
|
||||||
@@ -325,13 +359,21 @@ Fragments use isotope values to mark cleavage position:
|
|||||||
|
|
||||||
### summary.csv
|
### summary.csv
|
||||||
|
|
||||||
- `source_id`: Original molecule ID from MacrolactoneDB
|
- `ml_id`: MacrolactoneDB unique ID (e.g., ML00000001)
|
||||||
|
- `chembl_id`: Original CHEMBL ID (if available)
|
||||||
- `classification`: Classification result
|
- `classification`: Classification result
|
||||||
- `ring_size`: Detected ring size (12-20)
|
- `ring_size`: Detected ring size (12-20)
|
||||||
- `num_sidechains`: Number of side chains detected
|
- `num_sidechains`: Number of side chains detected
|
||||||
- `cleavage_positions`: JSON array of cleavage positions
|
- `cleavage_positions`: JSON array of cleavage positions
|
||||||
- `processing_status`: pending | success | failed | skipped
|
- `processing_status`: pending | success | failed | skipped
|
||||||
|
|
||||||
|
### fragment_library.csv
|
||||||
|
|
||||||
|
- `source_type`: validation_extract | supplemental (reserved)
|
||||||
|
- `has_dummy_atom`: Whether the fragment contains a dummy atom
|
||||||
|
- `dummy_atom_count`: Number of dummy atoms
|
||||||
|
- `splice_ready`: Whether the fragment is directly compatible with single-anchor splicing
|
||||||
|
|
||||||
## Querying the Database
|
## Querying the Database
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -363,7 +405,8 @@ sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY
|
|||||||
for p in parents:
|
for p in parents:
|
||||||
data.append({
|
data.append({
|
||||||
"id": p.id,
|
"id": p.id,
|
||||||
"source_id": p.source_id,
|
"ml_id": p.ml_id,
|
||||||
|
"chembl_id": p.chembl_id,
|
||||||
"molecule_name": p.molecule_name,
|
"molecule_name": p.molecule_name,
|
||||||
"smiles": p.smiles,
|
"smiles": p.smiles,
|
||||||
"classification": p.classification,
|
"classification": p.classification,
|
||||||
@@ -395,6 +438,47 @@ sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY
|
|||||||
print(f"\nSummary saved to {self.output_dir / 'summary.csv'}")
|
print(f"\nSummary saved to {self.output_dir / 'summary.csv'}")
|
||||||
print(f"Statistics: {stats}")
|
print(f"Statistics: {stats}")
|
||||||
|
|
||||||
|
def _generate_fragment_library(self):
    """Generate unified fragment library CSV.

    Reads every ``FragmentLibraryEntry`` row through a session on
    ``self.engine`` and writes it to ``fragment_library.csv`` inside
    ``self.output_dir``, one CSV column per exported attribute and without
    the DataFrame index.
    """
    # Exported attributes, in CSV column order.
    columns = [
        "id",
        "source_type",
        "source_fragment_id",
        "source_parent_ml_id",
        "source_parent_chembl_id",
        "cleavage_position",
        "fragment_smiles_labeled",
        "fragment_smiles_plain",
        "has_dummy_atom",
        "dummy_atom_count",
        "splice_ready",
        "original_bond_type",
        "created_at",
    ]

    with get_session(self.engine) as session:
        stored_entries = session.exec(select(FragmentLibraryEntry)).all()
        # Copy the values out while the session is still open.
        rows = [
            {column: getattr(entry, column) for column in columns}
            for entry in stored_entries
        ]

    frame = pd.DataFrame(rows, columns=columns)
    frame.to_csv(self.output_dir / "fragment_library.csv", index=False)
|
||||||
|
|
||||||
|
|
||||||
class MacrolactoneDetectionError(Exception):
|
class MacrolactoneDetectionError(Exception):
|
||||||
"""Raised when macrolactone detection fails."""
|
"""Raised when macrolactone detection fails."""
|
||||||
|
|||||||
102
tests/helpers.py
102
tests/helpers.py
@@ -78,6 +78,108 @@ def build_non_standard_ring_atom_macrolactone(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def build_macrolactone_with_fused_side_ring(
    ring_size: int = 16,
    fused_positions: tuple[int, int] = (5, 6),
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a two-carbon chain linking two ring positions.

    A base macrolactone is built with ``build_macrolactone``; two new carbon
    atoms are then added and single-bonded in sequence between the ring atoms
    at ``fused_positions``, which closes an additional ring.

    Returns a ``BuiltMacrolactone`` holding the sanitized molecule, its
    isomeric SMILES and the base molecule's ``position_to_atom`` mapping.
    """
    base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    first_position, second_position = fused_positions
    editable = Chem.RWMol(Chem.Mol(base.mol))

    first_carbon = editable.AddAtom(Chem.Atom("C"))
    second_carbon = editable.AddAtom(Chem.Atom("C"))

    # ring[first] - C - C - ring[second], added in this order.
    bond_pairs = (
        (base.position_to_atom[first_position], first_carbon),
        (first_carbon, second_carbon),
        (second_carbon, base.position_to_atom[second_position]),
    )
    for begin_idx, end_idx in bond_pairs:
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)

    fused_mol = editable.GetMol()
    Chem.SanitizeMol(fused_mol)
    return BuiltMacrolactone(
        mol=fused_mol,
        smiles=Chem.MolToSmiles(fused_mol, isomericSmiles=True),
        position_to_atom=base.position_to_atom,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_macrolactone_with_bridge_side_chain(
    ring_size: int = 16,
    bridge_positions: tuple[int, int] = (5, 8),
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone whose two ring positions are bridged by two carbons.

    Starting from ``build_macrolactone``, two new carbon atoms are added and
    connected by single bonds into a chain that runs from the ring atom at the
    first entry of ``bridge_positions`` to the ring atom at the second.

    Returns a ``BuiltMacrolactone`` holding the sanitized molecule, its
    isomeric SMILES and the base molecule's ``position_to_atom`` mapping.
    """
    base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    start_position, end_position = bridge_positions
    builder = Chem.RWMol(Chem.Mol(base.mol))

    # Atom indices along the bridge, listed from one ring anchor to the other.
    bridge_path = [
        base.position_to_atom[start_position],
        builder.AddAtom(Chem.Atom("C")),
        builder.AddAtom(Chem.Atom("C")),
        base.position_to_atom[end_position],
    ]
    for begin_idx, end_idx in zip(bridge_path, bridge_path[1:]):
        builder.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)

    bridged = builder.GetMol()
    Chem.SanitizeMol(bridged)
    bridged_smiles = Chem.MolToSmiles(bridged, isomericSmiles=True)
    return BuiltMacrolactone(
        mol=bridged,
        smiles=bridged_smiles,
        position_to_atom=base.position_to_atom,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_macrolactone_with_shared_atom_side_ring(
    ring_size: int = 16,
    position: int = 5,
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a small ring that shares one macrocycle atom.

    Three new carbons are added and single-bonded into a chain that both
    starts and ends on the ring atom at ``position``, so that atom belongs to
    the macrocycle and to the new four-membered ring.

    Returns a ``BuiltMacrolactone`` holding the sanitized molecule, its
    isomeric SMILES and the base molecule's ``position_to_atom`` mapping.
    """
    base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    editable = Chem.RWMol(Chem.Mol(base.mol))

    new_carbons = [editable.AddAtom(Chem.Atom("C")) for _ in range(3)]
    anchor_idx = base.position_to_atom[position]

    # anchor - C - C - C - anchor: the loop closes back on the same ring atom.
    loop_path = [anchor_idx, *new_carbons, anchor_idx]
    for begin_idx, end_idx in zip(loop_path, loop_path[1:]):
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)

    spiro_mol = editable.GetMol()
    Chem.SanitizeMol(spiro_mol)
    return BuiltMacrolactone(
        mol=spiro_mol,
        smiles=Chem.MolToSmiles(spiro_mol, isomericSmiles=True),
        position_to_atom=base.position_to_atom,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_macrolactone_with_single_anchor_side_ring(
    ring_size: int = 16,
    position: int = 5,
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone carrying a three-carbon ring on a single ring atom.

    Three new carbons are bonded into a closed three-membered ring, and the
    first of them is attached by one single bond to the ring atom at
    ``position``; that bond is the only connection to the macrocycle.

    Returns a ``BuiltMacrolactone`` holding the sanitized molecule, its
    isomeric SMILES and the base molecule's ``position_to_atom`` mapping.
    """
    base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    editable = Chem.RWMol(Chem.Mol(base.mol))

    carbon_a = editable.AddAtom(Chem.Atom("C"))
    carbon_b = editable.AddAtom(Chem.Atom("C"))
    carbon_c = editable.AddAtom(Chem.Atom("C"))
    anchor_idx = base.position_to_atom[position]

    # One attachment bond first, then the three bonds closing the side ring.
    bond_pairs = (
        (anchor_idx, carbon_a),
        (carbon_a, carbon_b),
        (carbon_b, carbon_c),
        (carbon_c, carbon_a),
    )
    for begin_idx, end_idx in bond_pairs:
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)

    decorated = editable.GetMol()
    Chem.SanitizeMol(decorated)
    return BuiltMacrolactone(
        mol=decorated,
        smiles=Chem.MolToSmiles(decorated, isomericSmiles=True),
        position_to_atom=base.position_to_atom,
    )
|
||||||
|
|
||||||
|
|
||||||
def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone:
|
def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone:
|
||||||
rwmol = Chem.RWMol()
|
rwmol = Chem.RWMol()
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,12 @@ from rdkit import Chem
|
|||||||
|
|
||||||
from macro_lactone_toolkit import MacrolactoneFragmenter
|
from macro_lactone_toolkit import MacrolactoneFragmenter
|
||||||
|
|
||||||
from .helpers import build_macrolactone
|
from .helpers import (
|
||||||
|
build_macrolactone,
|
||||||
|
build_macrolactone_with_fused_side_ring,
|
||||||
|
build_macrolactone_with_shared_atom_side_ring,
|
||||||
|
build_macrolactone_with_single_anchor_side_ring,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_fragmentation_returns_empty_list_without_sidechains():
|
def test_fragmentation_returns_empty_list_without_sidechains():
|
||||||
@@ -51,3 +56,24 @@ def test_fragmentation_preserves_attachment_bond_type():
|
|||||||
neighbor = dummy_atom.GetNeighbors()[0]
|
neighbor = dummy_atom.GetNeighbors()[0]
|
||||||
bond = mol.GetBondBetweenAtoms(dummy_atom.GetIdx(), neighbor.GetIdx())
|
bond = mol.GetBondBetweenAtoms(dummy_atom.GetIdx(), neighbor.GetIdx())
|
||||||
assert bond.GetBondType() == Chem.BondType.DOUBLE
|
assert bond.GetBondType() == Chem.BondType.DOUBLE
|
||||||
|
|
||||||
|
|
||||||
|
def test_fragmentation_skips_fused_side_ring_but_keeps_single_anchor_sidechains():
    """Only the methyl side chain at position 10 is cleaved from the fused-ring molecule."""
    fused = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    fragmenter = MacrolactoneFragmenter()

    outcome = fragmenter.fragment_molecule(fused.smiles, parent_id="fused")

    cleaved_positions = {piece.cleavage_position for piece in outcome.fragments}
    assert cleaved_positions == {10}
|
||||||
|
|
||||||
|
|
||||||
|
def test_fragmentation_skips_shared_atom_multi_anchor_component():
    """Only the ethyl side chain at position 11 is cleaved; the shared-atom ring yields no fragment."""
    molecule = build_macrolactone_with_shared_atom_side_ring(side_chains={11: "ethyl"})
    fragmenter = MacrolactoneFragmenter()

    outcome = fragmenter.fragment_molecule(molecule.smiles, parent_id="shared_atom")

    cleaved_positions = {piece.cleavage_position for piece in outcome.fragments}
    assert cleaved_positions == {11}
|
||||||
|
|
||||||
|
|
||||||
|
def test_fragmentation_allows_single_anchor_side_ring():
    """A side ring attached through a single bond at position 5 is cleaved there."""
    molecule = build_macrolactone_with_single_anchor_side_ring()
    fragmenter = MacrolactoneFragmenter()

    outcome = fragmenter.fragment_molecule(molecule.smiles, parent_id="single_anchor_ring")

    cleaved_positions = {piece.cleavage_position for piece in outcome.fragments}
    assert cleaved_positions == {5}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from macro_lactone_toolkit import MacrolactoneFragmenter
|
|||||||
from macro_lactone_toolkit.splicing.engine import splice_molecule
|
from macro_lactone_toolkit.splicing.engine import splice_molecule
|
||||||
from macro_lactone_toolkit.splicing.scaffold_prep import prepare_macrolactone_scaffold
|
from macro_lactone_toolkit.splicing.scaffold_prep import prepare_macrolactone_scaffold
|
||||||
|
|
||||||
from .helpers import build_macrolactone, canonicalize
|
from .helpers import build_macrolactone, build_macrolactone_with_fused_side_ring, canonicalize
|
||||||
|
|
||||||
|
|
||||||
def test_splice_benzene_methyl():
|
def test_splice_benzene_methyl():
|
||||||
@@ -49,3 +49,14 @@ def test_prepare_scaffold_and_reassemble_fragment():
|
|||||||
product = splice_molecule(scaffold, Chem.MolFromSmiles(fragment.fragment_smiles_labeled), position=5)
|
product = splice_molecule(scaffold, Chem.MolFromSmiles(fragment.fragment_smiles_labeled), position=5)
|
||||||
|
|
||||||
assert canonicalize(product) == canonicalize(built.mol)
|
assert canonicalize(product) == canonicalize(built.mol)
|
||||||
|
|
||||||
|
|
||||||
|
def test_prepare_scaffold_rejects_position_without_single_anchor_fragment():
    """Preparing a scaffold at position 5 of the fused-ring molecule raises ``ValueError``."""
    fused = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    expected_message = "Position 5 does not contain a single-anchor fragmentable side chain"

    with pytest.raises(ValueError, match=expected_message):
        prepare_macrolactone_scaffold(fused.smiles, positions=[5], ring_size=16)
|
||||||
|
|||||||
57
tests/validation/test_validator.py
Normal file
57
tests/validation/test_validator.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from macro_lactone_toolkit.validation.validator import MacrolactoneValidator
|
||||||
|
|
||||||
|
from ..helpers import build_macrolactone_with_fused_side_ring
|
||||||
|
|
||||||
|
|
||||||
|
def test_validator_exports_only_single_anchor_fragments_and_fragment_library(tmp_path):
    """Run the validator end to end on one fused-ring molecule and check its outputs.

    The input molecule has a fused side ring (builder defaults) plus a methyl
    side chain at position 10. The test checks that only the position-10
    fragment appears in the SQLite tables, in ``summary.csv`` and in
    ``fragment_library.csv``.
    """
    built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    input_path = tmp_path / "input.csv"
    output_dir = tmp_path / "validation_output"

    pd.DataFrame(
        [
            {
                "ml_id": "ML00000001",
                "IDs": "CHEMBL0001",
                "smiles": built.smiles,
            }
        ]
    ).to_csv(input_path, index=False)

    validator = MacrolactoneValidator(output_dir=output_dir, sample_ratio=1.0)
    results = validator.run(input_path)

    assert results == {"total": 1, "success": 1, "failed": 0, "skipped": 0}

    # sqlite3's connection context manager only commits or rolls back the
    # transaction; it does not close the connection. Close it explicitly so
    # the database file is not left open after the test.
    connection = sqlite3.connect(output_dir / "fragments.db")
    try:
        fragments = connection.execute(
            "SELECT cleavage_position, has_dummy_atom, dummy_atom_count FROM side_chain_fragments"
        ).fetchall()
        library_entries = connection.execute(
            """
            SELECT source_type, source_parent_ml_id, source_parent_chembl_id,
                   cleavage_position, has_dummy_atom, dummy_atom_count, splice_ready
            FROM fragment_library_entries
            """
        ).fetchall()
    finally:
        connection.close()

    assert fragments == [(10, 1, 1)]
    assert library_entries == [("validation_extract", "ML00000001", "CHEMBL0001", 10, 1, 1, 1)]

    summary = pd.read_csv(output_dir / "summary.csv")
    assert summary.loc[0, "num_sidechains"] == 1
    assert json.loads(summary.loc[0, "cleavage_positions"]) == [10]

    fragment_library = pd.read_csv(output_dir / "fragment_library.csv")
    assert fragment_library.loc[0, "source_type"] == "validation_extract"
    assert int(fragment_library.loc[0, "cleavage_position"]) == 10
    assert bool(fragment_library.loc[0, "has_dummy_atom"]) is True
    assert int(fragment_library.loc[0, "dummy_atom_count"]) == 1
    assert bool(fragment_library.loc[0, "splice_ready"]) is True
|
||||||
97
validation_output/README.md
Normal file
97
validation_output/README.md
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
# MacrolactoneDB Validation Output
|
||||||
|
|
||||||
|
This directory contains validation results for MacrolactoneDB 12-20 membered rings.
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
validation_output/
|
||||||
|
├── README.md # This file
|
||||||
|
├── fragments.db # SQLite database with all data
|
||||||
|
├── fragment_library.csv # Unified fragment library export
|
||||||
|
├── summary.csv # Summary of all processed molecules
|
||||||
|
├── summary_statistics.json # Statistical summary
|
||||||
|
│
|
||||||
|
├── ring_size_12/ # 12-membered rings
|
||||||
|
├── ring_size_13/ # 13-membered rings
|
||||||
|
...
|
||||||
|
└── ring_size_20/ # 20-membered rings
|
||||||
|
├── molecules.csv # Molecules in this ring size
|
||||||
|
├── standard/ # Standard macrolactones
|
||||||
|
│ ├── numbered/ # Numbered ring images
|
||||||
|
│ │ └── {id}_numbered.png
|
||||||
|
│ └── sidechains/ # Fragment images
|
||||||
|
│ └── {id}/
|
||||||
|
│ └── {id}_frag_{n}_pos{pos}.png
|
||||||
|
├── non_standard/ # Non-standard macrocycles
|
||||||
|
│ └── original/
|
||||||
|
│ └── {id}_original.png
|
||||||
|
└── rejected/ # Not macrolactones
|
||||||
|
└── original/
|
||||||
|
└── {id}_original.png
|
||||||
|
```
|
||||||
|
|
||||||
|
## Database Schema
|
||||||
|
|
||||||
|
### Tables
|
||||||
|
|
||||||
|
- **parent_molecules**: Original molecule information
|
||||||
|
- **ring_numberings**: Ring atom numbering details
|
||||||
|
- **side_chain_fragments**: Fragmentation results with isotope tags
|
||||||
|
- **fragment_library_entries**: Unified fragment library rows for downstream design
|
||||||
|
- **validation_results**: Manual validation records
|
||||||
|
|
||||||
|
### Key Fields
|
||||||
|
|
||||||
|
- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone
|
||||||
|
- `dummy_isotope`: Cleavage position stored as isotope value for reconstruction
|
||||||
|
- `cleavage_position`: Position on ring where side chain was attached
|
||||||
|
- `has_dummy_atom`: Whether the fragment contains a dummy atom for splicing
|
||||||
|
- `dummy_atom_count`: Number of dummy atoms in the fragment
|
||||||
|
|
||||||
|
## Ring Numbering Convention
|
||||||
|
|
||||||
|
1. Position 1 = Lactone carbonyl carbon (C=O)
|
||||||
|
2. Position 2 = Ester oxygen (-O-)
|
||||||
|
3. Positions 3-N = Sequential around ring
|
||||||
|
|
||||||
|
## Isotope Tagging
|
||||||
|
|
||||||
|
Fragments use isotope values to mark cleavage position:
|
||||||
|
- `[5*]CCO` = Fragment from position 5, dummy atom has isotope=5
|
||||||
|
- This enables precise reconstruction during reassembly
|
||||||
|
|
||||||
|
## CSV Columns
|
||||||
|
|
||||||
|
### summary.csv
|
||||||
|
|
||||||
|
- `ml_id`: MacrolactoneDB unique ID (e.g., ML00000001)
|
||||||
|
- `chembl_id`: Original CHEMBL ID (if available)
|
||||||
|
- `classification`: Classification result
|
||||||
|
- `ring_size`: Detected ring size (12-20)
|
||||||
|
- `num_sidechains`: Number of side chains detected
|
||||||
|
- `cleavage_positions`: JSON array of cleavage positions
|
||||||
|
- `processing_status`: pending | success | failed | skipped
|
||||||
|
|
||||||
|
### fragment_library.csv
|
||||||
|
|
||||||
|
- `source_type`: validation_extract | supplemental (reserved)
|
||||||
|
- `has_dummy_atom`: Whether the fragment contains a dummy atom
|
||||||
|
- `dummy_atom_count`: Number of dummy atoms
|
||||||
|
- `splice_ready`: Whether the fragment is directly compatible with single-anchor splicing
|
||||||
|
|
||||||
|
## Querying the Database
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List tables
|
||||||
|
sqlite3 fragments.db ".tables"
|
||||||
|
|
||||||
|
# List up to five standard macrolactones
|
||||||
|
sqlite3 fragments.db "SELECT * FROM parent_molecules WHERE classification='standard_macrolactone' LIMIT 5;"
|
||||||
|
|
||||||
|
# Get fragments for a specific molecule
|
||||||
|
sqlite3 fragments.db "SELECT * FROM side_chain_fragments WHERE parent_id=1;"
|
||||||
|
|
||||||
|
# Count by ring size
|
||||||
|
sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY ring_size;"
|
||||||
|
```
|
||||||
6280
validation_output/fragment_library_filter_gt3.csv
Normal file
6280
validation_output/fragment_library_filter_gt3.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
validation_output/fragments.db
Normal file
BIN
validation_output/fragments.db
Normal file
Binary file not shown.
11037
validation_output/summary.csv
Normal file
11037
validation_output/summary.csv
Normal file
File diff suppressed because it is too large
Load Diff
23
validation_output/summary_statistics.json
Normal file
23
validation_output/summary_statistics.json
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
{
|
||||||
|
"total_molecules": 11036,
|
||||||
|
"by_classification": {
|
||||||
|
"non_standard_macrocycle": 6336,
|
||||||
|
"standard_macrolactone": 4482,
|
||||||
|
"not_macrolactone": 218
|
||||||
|
},
|
||||||
|
"by_ring_size": {
|
||||||
|
"14.0": 3017,
|
||||||
|
"16.0": 1879,
|
||||||
|
"15.0": 1613,
|
||||||
|
"12.0": 1419,
|
||||||
|
"19.0": 855,
|
||||||
|
"18.0": 809,
|
||||||
|
"13.0": 679,
|
||||||
|
"20.0": 243,
|
||||||
|
"17.0": 196
|
||||||
|
},
|
||||||
|
"by_status": {
|
||||||
|
"skipped": 6554,
|
||||||
|
"success": 4482
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user