feat(validation): enforce single-anchor fragments

- skip fused/shared/multi-anchor side systems during extraction
- add fragment library schema and fragment_library.csv export
- make scaffold prep strict for non-spliceable positions
This commit is contained in:
2026-03-19 14:20:32 +08:00
parent 07ba27be2b
commit 46a438dd36
10 changed files with 383 additions and 21 deletions

View File

@@ -96,6 +96,7 @@ pixi run python scripts/validate_macrolactone_db.py \
validation_output/
├── README.md # 目录说明
├── fragments.db # SQLite 数据库
├── fragment_library.csv # 最终片段库导出(含 has_dummy_atom / splice_ready)
├── summary.csv # 汇总表(含 ml_id, chembl_id)
├── summary_statistics.json # 统计信息
├── ring_size_12/ # 按环大小组织
@@ -124,11 +125,15 @@ sqlite3 validation_output/fragments.db \
FROM parent_molecules \
WHERE classification='standard_macrolactone' LIMIT 5;"
# 查询最终片段库
sqlite3 validation_output/fragments.db \
"SELECT source_type, source_parent_ml_id, cleavage_position, has_dummy_atom, splice_ready \
FROM fragment_library_entries LIMIT 10;"
# 查询片段
sqlite3 validation_output/fragments.db \
"SELECT fragment_id, cleavage_position, dummy_isotope \
FROM side_chain_fragments \
WHERE ml_id='ML00000001';"
"SELECT fragment_id, cleavage_position, dummy_isotope, has_dummy_atom, dummy_atom_count \
FROM side_chain_fragments LIMIT 10;"
# 按环大小统计
sqlite3 validation_output/fragments.db \
@@ -144,6 +149,8 @@ sqlite3 validation_output/fragments.db \
| `classification` | standard_macrolactone / non_standard_macrocycle / not_macrolactone |
| `dummy_isotope` | 裂解位置编号,用于片段重建 |
| `cleavage_position` | 环上的断裂位置 |
| `has_dummy_atom` | 该片段是否带 dummy 原子,可用于区分可直接拼接片段 |
| `splice_ready` | 是否与当前单锚点拼接流程直接兼容 |
## Legacy Scripts

View File

@@ -272,6 +272,44 @@ def collect_side_chain_atoms(
return side_chain_atoms
def find_side_chain_ring_connections(
    mol: Chem.Mol,
    side_chain_atoms: Iterable[int],
    ring_atom_indices: Iterable[int],
) -> list[tuple[int, int]]:
    """List every bond that joins a side-chain atom to a ring atom.

    Args:
        mol: Molecule containing both the side chain and the ring.
        side_chain_atoms: Atom indices belonging to the side chain.
        ring_atom_indices: Atom indices belonging to the ring.

    Returns:
        Unique ``(side_chain_atom_idx, ring_atom_idx)`` pairs, sorted by ring
        atom index first and side-chain atom index second.
    """
    ring_members = set(ring_atom_indices)
    # A set removes duplicates when the same side-chain index is supplied twice.
    bonded_pairs = {
        (side_idx, adjacent.GetIdx())
        for side_idx in side_chain_atoms
        for adjacent in mol.GetAtomWithIdx(side_idx).GetNeighbors()
        if adjacent.GetIdx() in ring_members
    }
    return sorted(bonded_pairs, key=lambda pair: (pair[1], pair[0]))
def collect_fragmentable_side_chain_atoms(
    mol: Chem.Mol,
    start_atom_idx: int,
    ring_atom_indices: Iterable[int],
    ring_atom_idx: int | None = None,
) -> list[int] | None:
    """Collect a side chain only when it is attached to the ring by one bond.

    Args:
        mol: Molecule containing both the ring and the side chain.
        start_atom_idx: Atom index the side-chain collection starts from.
        ring_atom_indices: Atom indices belonging to the ring. Materialised
            once, so a one-shot iterator is accepted as well.
        ring_atom_idx: Optional ring atom the single ring connection must end
            on; when ``None`` any ring atom is accepted.

    Returns:
        The atoms returned by ``collect_side_chain_atoms``, or ``None`` when
        that result is empty, when the side chain touches the ring through
        zero or several bonds, or when its only ring bond ends on a ring atom
        other than ``ring_atom_idx``.
    """
    # The ring indices are needed twice below; a bare iterator would be
    # exhausted by the first call and make every side chain look unattached.
    ring_atom_set = set(ring_atom_indices)

    side_chain_atoms = collect_side_chain_atoms(mol, start_atom_idx, ring_atom_set)
    if not side_chain_atoms:
        return None

    ring_connections = find_side_chain_ring_connections(mol, side_chain_atoms, ring_atom_set)
    if len(ring_connections) != 1:
        # Fused, bridged or otherwise multi-anchor components are rejected.
        return None

    _, anchor_idx = ring_connections[0]
    if ring_atom_idx is not None and anchor_idx != ring_atom_idx:
        return None
    return side_chain_atoms
def is_intrinsic_lactone_neighbor(
mol: Chem.Mol,
candidate: DetectedMacrolactone,

View File

@@ -6,7 +6,7 @@ from rdkit.Chem import Descriptors
from ._core import (
build_fragment_smiles,
build_numbering_result,
collect_side_chain_atoms,
collect_fragmentable_side_chain_atoms,
ensure_mol,
find_macrolactone_candidates,
is_intrinsic_lactone_neighbor,
@@ -44,6 +44,8 @@ class MacrolactoneFragmenter:
fragments: list[SideChainFragment] = []
for position, ring_atom_idx in numbering.position_to_atom.items():
if int(position) <= 2:
continue
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
for neighbor in ring_atom.GetNeighbors():
neighbor_idx = neighbor.GetIdx()
@@ -52,8 +54,13 @@ class MacrolactoneFragmenter:
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
continue
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
if not side_chain_atoms:
side_chain_atoms = collect_fragmentable_side_chain_atoms(
mol=mol,
start_atom_idx=neighbor_idx,
ring_atom_indices=ring_atom_set,
ring_atom_idx=ring_atom_idx,
)
if side_chain_atoms is None:
continue
try:

View File

@@ -4,7 +4,7 @@ from typing import Iterable
from rdkit import Chem
from .._core import collect_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor
from .._core import collect_fragmentable_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor
from ..fragmenter import MacrolactoneFragmenter
@@ -26,22 +26,36 @@ def prepare_macrolactone_scaffold(
for position in positions:
if position not in numbering.position_to_atom:
raise ValueError(f"Position {position} not found in ring numbering.")
if position <= 2:
raise ValueError(f"Position {position} does not contain a single-anchor fragmentable side chain")
ring_atom_idx = numbering.position_to_atom[position]
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
position_dummy_specs: list[tuple[int, int, Chem.BondType]] = []
for neighbor in ring_atom.GetNeighbors():
neighbor_idx = neighbor.GetIdx()
if neighbor_idx in ring_atom_set:
continue
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
continue
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
side_chain_atoms = collect_fragmentable_side_chain_atoms(
mol=mol,
start_atom_idx=neighbor_idx,
ring_atom_indices=ring_atom_set,
ring_atom_idx=ring_atom_idx,
)
if side_chain_atoms is None:
continue
atoms_to_remove.update(side_chain_atoms)
bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx)
if bond is not None:
dummy_specs.append((ring_atom_idx, position, bond.GetBondType()))
position_dummy_specs.append((ring_atom_idx, position, bond.GetBondType()))
if not any(spec_position == position for _, spec_position, _ in dummy_specs):
dummy_specs.append((ring_atom_idx, position, Chem.BondType.SINGLE))
if not position_dummy_specs:
raise ValueError(f"Position {position} does not contain a single-anchor fragmentable side chain")
if len(position_dummy_specs) > 1:
raise ValueError(f"Position {position} contains multiple fragmentable side chains")
dummy_specs.extend(position_dummy_specs)
rwmol = Chem.RWMol(mol)
for ring_atom_idx, position, bond_type in dummy_specs:

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from datetime import datetime
from datetime import UTC, datetime
from typing import List, Optional
from sqlalchemy.orm import Mapped, mapped_column, relationship
@@ -40,7 +40,7 @@ class ParentMolecule(SQLModel, table=True):
num_sidechains: Optional[int] = None
cleavage_positions: Optional[str] = None
numbered_image_path: Optional[str] = None
created_at: datetime = Field(default_factory=datetime.utcnow)
created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
processed_at: Optional[datetime] = None
@@ -72,6 +72,8 @@ class SideChainFragment(SQLModel, table=True):
fragment_smiles_labeled: str
fragment_smiles_plain: str
dummy_isotope: int
has_dummy_atom: bool = Field(default=True)
dummy_atom_count: int = Field(default=1)
atom_count: int
heavy_atom_count: int
molecular_weight: float
@@ -79,6 +81,26 @@ class SideChainFragment(SQLModel, table=True):
image_path: Optional[str] = None
class FragmentLibraryEntry(SQLModel, table=True):
    """Unified fragment library entries.

    Each instance maps to one row of the ``fragment_library_entries`` table.
    """

    __tablename__ = "fragment_library_entries"

    # Primary key; defaults to None on construction.
    id: Optional[int] = Field(default=None, primary_key=True)
    # Origin of the row (indexed), e.g. "validation_extract".
    source_type: str = Field(index=True)
    # Optional identifiers linking the row back to its fragment and parent (all indexed).
    source_fragment_id: Optional[str] = Field(default=None, index=True)
    source_parent_ml_id: Optional[str] = Field(default=None, index=True)
    source_parent_chembl_id: Optional[str] = Field(default=None, index=True)
    # Ring position the fragment was cleaved from, when known (indexed).
    cleavage_position: Optional[int] = Field(default=None, index=True)
    # The labeled SMILES is optional; the plain SMILES is the only required SMILES field.
    fragment_smiles_labeled: Optional[str] = None
    fragment_smiles_plain: str
    # Dummy-atom bookkeeping; both default to "no dummy atom".
    has_dummy_atom: bool = Field(default=False)
    dummy_atom_count: int = Field(default=0)
    # Whether the fragment is directly compatible with single-anchor splicing (indexed).
    splice_ready: bool = Field(default=False, index=True)
    # Bond type of the original attachment, stored as text when available.
    original_bond_type: Optional[str] = None
    # Timezone-aware UTC timestamp taken when the instance is created.
    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
class ValidationResult(SQLModel, table=True):
"""Manual validation records."""

View File

@@ -1,7 +1,7 @@
from __future__ import annotations
import json
from datetime import datetime
from datetime import UTC, datetime
from pathlib import Path
import pandas as pd
@@ -12,7 +12,7 @@ from sqlmodel import select
from macro_lactone_toolkit import MacroLactoneAnalyzer
from macro_lactone_toolkit._core import (
build_numbering_result,
collect_side_chain_atoms,
collect_fragmentable_side_chain_atoms,
find_macrolactone_candidates,
is_intrinsic_lactone_neighbor,
)
@@ -20,6 +20,7 @@ from macro_lactone_toolkit.validation.database import get_engine, get_session, i
from macro_lactone_toolkit.validation.isotope_utils import build_fragment_with_isotope
from macro_lactone_toolkit.validation.models import (
ClassificationType,
FragmentLibraryEntry,
ParentMolecule,
ProcessingStatus,
RingNumbering,
@@ -80,6 +81,7 @@ class MacrolactoneValidator:
# Generate outputs
self._generate_readme()
self._generate_summary()
self._generate_fragment_library()
return results
@@ -138,7 +140,7 @@ class MacrolactoneValidator:
except Exception as e:
parent.processing_status = ProcessingStatus.FAILED
parent.error_message = str(e)
parent.processed_at = datetime.utcnow()
parent.processed_at = datetime.now(UTC)
session.add(parent)
session.commit()
return "failed"
@@ -188,6 +190,8 @@ class MacrolactoneValidator:
fragment_idx = 0
for position, ring_atom_idx in numbering.position_to_atom.items():
if int(position) <= 2:
continue
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
for neighbor in ring_atom.GetNeighbors():
@@ -200,8 +204,13 @@ class MacrolactoneValidator:
continue
# Collect side chain atoms
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
if not side_chain_atoms:
side_chain_atoms = collect_fragmentable_side_chain_atoms(
mol=mol,
start_atom_idx=neighbor_idx,
ring_atom_indices=ring_atom_set,
ring_atom_idx=ring_atom_idx,
)
if side_chain_atoms is None:
continue
# Build fragment with isotope tagging
@@ -228,6 +237,8 @@ class MacrolactoneValidator:
fragment_smiles_labeled=labeled_smiles,
fragment_smiles_plain=plain_smiles,
dummy_isotope=int(position),
has_dummy_atom=True,
dummy_atom_count=1,
atom_count=atom_count,
heavy_atom_count=heavy_atom_count,
molecular_weight=round(mw, 4),
@@ -235,6 +246,21 @@ class MacrolactoneValidator:
)
session.add(fragment)
fragments.append(fragment)
session.add(
FragmentLibraryEntry(
source_type="validation_extract",
source_fragment_id=fragment.fragment_id,
source_parent_ml_id=parent.ml_id,
source_parent_chembl_id=parent.chembl_id,
cleavage_position=int(position),
fragment_smiles_labeled=labeled_smiles,
fragment_smiles_plain=plain_smiles,
has_dummy_atom=True,
dummy_atom_count=1,
splice_ready=True,
original_bond_type=bond_type,
)
)
fragment_idx += 1
# Save fragment images
@@ -248,7 +274,7 @@ class MacrolactoneValidator:
parent.processing_status = ProcessingStatus.SUCCESS
parent.num_sidechains = len(fragments)
parent.cleavage_positions = json.dumps([f.cleavage_position for f in fragments])
parent.processed_at = datetime.utcnow()
parent.processed_at = datetime.now(UTC)
session.add(parent)
session.commit()
@@ -276,6 +302,7 @@ This directory contains validation results for MacrolactoneDB 12-20 membered rin
validation_output/
├── README.md # This file
├── fragments.db # SQLite database with all data
├── fragment_library.csv # Unified fragment library export
├── summary.csv # Summary of all processed molecules
├── summary_statistics.json # Statistical summary
@@ -305,6 +332,7 @@ validation_output/
- **parent_molecules**: Original molecule information
- **ring_numberings**: Ring atom numbering details
- **side_chain_fragments**: Fragmentation results with isotope tags
- **fragment_library_entries**: Unified fragment library rows for downstream design
- **validation_results**: Manual validation records
### Key Fields
@@ -312,6 +340,8 @@ validation_output/
- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone
- `dummy_isotope`: Cleavage position stored as isotope value for reconstruction
- `cleavage_position`: Position on ring where side chain was attached
- `has_dummy_atom`: Whether the fragment contains a dummy atom for splicing
- `dummy_atom_count`: Number of dummy atoms in the fragment
## Ring Numbering Convention
@@ -337,6 +367,13 @@ Fragments use isotope values to mark cleavage position:
- `cleavage_positions`: JSON array of cleavage positions
- `processing_status`: pending | success | failed | skipped
### fragment_library.csv
- `source_type`: validation_extract | supplemental (reserved)
- `has_dummy_atom`: Whether the fragment contains a dummy atom
- `dummy_atom_count`: Number of dummy atoms
- `splice_ready`: Whether the fragment is directly compatible with single-anchor splicing
## Querying the Database
```bash
@@ -401,6 +438,47 @@ sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY
print(f"\nSummary saved to {self.output_dir / 'summary.csv'}")
print(f"Statistics: {stats}")
def _generate_fragment_library(self):
    """Generate the unified fragment library CSV.

    Reads every ``FragmentLibraryEntry`` row and writes it to
    ``fragment_library.csv`` inside ``self.output_dir``. Rows are ordered by
    primary key so repeated exports of the same database are identical, and
    the header is written even when the table is empty.
    """
    columns = [
        "id",
        "source_type",
        "source_fragment_id",
        "source_parent_ml_id",
        "source_parent_chembl_id",
        "cleavage_position",
        "fragment_smiles_labeled",
        "fragment_smiles_plain",
        "has_dummy_atom",
        "dummy_atom_count",
        "splice_ready",
        "original_bond_type",
        "created_at",
    ]

    with get_session(self.engine) as session:
        # Without an explicit ORDER BY the database may return rows in any order.
        statement = select(FragmentLibraryEntry).order_by(FragmentLibraryEntry.id)
        entries = session.exec(statement).all()
        # Each CSV column is named after the model attribute it exports, so the
        # rows are derived from the single column list above.
        data = [{column: getattr(entry, column) for column in columns} for entry in entries]

    pd.DataFrame(data, columns=columns).to_csv(self.output_dir / "fragment_library.csv", index=False)
class MacrolactoneDetectionError(Exception):
"""Raised when macrolactone detection fails."""

View File

@@ -78,6 +78,108 @@ def build_non_standard_ring_atom_macrolactone(
)
def build_macrolactone_with_fused_side_ring(
    ring_size: int = 16,
    fused_positions: tuple[int, int] = (5, 6),
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a two-carbon chain joining two ring positions.

    Args:
        ring_size: Ring size forwarded to ``build_macrolactone``.
        fused_positions: The two ring positions the new chain is bonded to.
        side_chains: Side chains forwarded to ``build_macrolactone``.

    Returns:
        The sanitized molecule, its isomeric SMILES and the position-to-atom
        mapping of the base macrolactone.
    """
    parent = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    first_position, second_position = fused_positions
    editable = Chem.RWMol(Chem.Mol(parent.mol))
    first_carbon = editable.AddAtom(Chem.Atom("C"))
    second_carbon = editable.AddAtom(Chem.Atom("C"))
    # Walk ring atom -> C -> C -> ring atom, bonding each consecutive pair.
    chain = (
        parent.position_to_atom[first_position],
        first_carbon,
        second_carbon,
        parent.position_to_atom[second_position],
    )
    for begin_idx, end_idx in zip(chain, chain[1:]):
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)
    fused = editable.GetMol()
    Chem.SanitizeMol(fused)
    return BuiltMacrolactone(
        mol=fused,
        smiles=Chem.MolToSmiles(fused, isomericSmiles=True),
        position_to_atom=parent.position_to_atom,
    )
def build_macrolactone_with_bridge_side_chain(
    ring_size: int = 16,
    bridge_positions: tuple[int, int] = (5, 8),
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a two-carbon bridge between two ring positions.

    Args:
        ring_size: Ring size forwarded to ``build_macrolactone``.
        bridge_positions: The two ring positions the bridge is bonded to.
        side_chains: Side chains forwarded to ``build_macrolactone``.

    Returns:
        The sanitized molecule, its isomeric SMILES and the position-to-atom
        mapping of the base macrolactone.
    """
    parent = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    start_position, end_position = bridge_positions
    editable = Chem.RWMol(Chem.Mol(parent.mol))
    first_carbon = editable.AddAtom(Chem.Atom("C"))
    second_carbon = editable.AddAtom(Chem.Atom("C"))
    # Walk ring atom -> C -> C -> ring atom, bonding each consecutive pair.
    bridge = (
        parent.position_to_atom[start_position],
        first_carbon,
        second_carbon,
        parent.position_to_atom[end_position],
    )
    for begin_idx, end_idx in zip(bridge, bridge[1:]):
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)
    bridged = editable.GetMol()
    Chem.SanitizeMol(bridged)
    return BuiltMacrolactone(
        mol=bridged,
        smiles=Chem.MolToSmiles(bridged, isomericSmiles=True),
        position_to_atom=parent.position_to_atom,
    )
def build_macrolactone_with_shared_atom_side_ring(
    ring_size: int = 16,
    position: int = 5,
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone with a three-carbon loop closed on one ring atom.

    Both ends of the new chain are bonded to the ring atom at ``position``,
    so the added carbons touch the macrocycle through two bonds.

    Args:
        ring_size: Ring size forwarded to ``build_macrolactone``.
        position: Ring position that both ends of the chain attach to.
        side_chains: Side chains forwarded to ``build_macrolactone``.

    Returns:
        The sanitized molecule, its isomeric SMILES and the position-to-atom
        mapping of the base macrolactone.
    """
    parent = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    editable = Chem.RWMol(Chem.Mol(parent.mol))
    new_carbons = [editable.AddAtom(Chem.Atom("C")) for _ in range(3)]
    anchor_idx = parent.position_to_atom[position]
    # anchor -> C -> C -> C -> anchor: the loop starts and ends on the ring atom.
    loop = [anchor_idx, *new_carbons, anchor_idx]
    for begin_idx, end_idx in zip(loop, loop[1:]):
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)
    result_mol = editable.GetMol()
    Chem.SanitizeMol(result_mol)
    return BuiltMacrolactone(
        mol=result_mol,
        smiles=Chem.MolToSmiles(result_mol, isomericSmiles=True),
        position_to_atom=parent.position_to_atom,
    )
def build_macrolactone_with_single_anchor_side_ring(
    ring_size: int = 16,
    position: int = 5,
    side_chains: Mapping[int, str] | None = None,
) -> BuiltMacrolactone:
    """Build a macrolactone carrying a three-carbon ring attached by one bond.

    The three added carbons close on themselves; only the first of them is
    bonded to the ring atom at ``position``.

    Args:
        ring_size: Ring size forwarded to ``build_macrolactone``.
        position: Ring position the side ring is attached to.
        side_chains: Side chains forwarded to ``build_macrolactone``.

    Returns:
        The sanitized molecule, its isomeric SMILES and the position-to-atom
        mapping of the base macrolactone.
    """
    parent = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
    editable = Chem.RWMol(Chem.Mol(parent.mol))
    new_carbons = [editable.AddAtom(Chem.Atom("C")) for _ in range(3)]
    anchor_idx = parent.position_to_atom[position]
    # anchor -> C1 -> C2 -> C3 -> C1: the loop closes on the first new carbon.
    walk = [anchor_idx, *new_carbons, new_carbons[0]]
    for begin_idx, end_idx in zip(walk, walk[1:]):
        editable.AddBond(begin_idx, end_idx, Chem.BondType.SINGLE)
    result_mol = editable.GetMol()
    Chem.SanitizeMol(result_mol)
    return BuiltMacrolactone(
        mol=result_mol,
        smiles=Chem.MolToSmiles(result_mol, isomericSmiles=True),
        position_to_atom=parent.position_to_atom,
    )
def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone:
rwmol = Chem.RWMol()

View File

@@ -2,7 +2,12 @@ from rdkit import Chem
from macro_lactone_toolkit import MacrolactoneFragmenter
from .helpers import build_macrolactone
from .helpers import (
build_macrolactone,
build_macrolactone_with_fused_side_ring,
build_macrolactone_with_shared_atom_side_ring,
build_macrolactone_with_single_anchor_side_ring,
)
def test_fragmentation_returns_empty_list_without_sidechains():
@@ -51,3 +56,24 @@ def test_fragmentation_preserves_attachment_bond_type():
neighbor = dummy_atom.GetNeighbors()[0]
bond = mol.GetBondBetweenAtoms(dummy_atom.GetIdx(), neighbor.GetIdx())
assert bond.GetBondType() == Chem.BondType.DOUBLE
def test_fragmentation_skips_fused_side_ring_but_keeps_single_anchor_sidechains():
    """Only the methyl at position 10 is reported when a fused side ring is present."""
    molecule = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    fragmentation = MacrolactoneFragmenter().fragment_molecule(molecule.smiles, parent_id="fused")
    observed_positions = {item.cleavage_position for item in fragmentation.fragments}
    assert observed_positions == {10}
def test_fragmentation_skips_shared_atom_multi_anchor_component():
    """Only the ethyl at position 11 is reported when a shared-atom side ring is present."""
    molecule = build_macrolactone_with_shared_atom_side_ring(side_chains={11: "ethyl"})
    fragmentation = MacrolactoneFragmenter().fragment_molecule(molecule.smiles, parent_id="shared_atom")
    observed_positions = {item.cleavage_position for item in fragmentation.fragments}
    assert observed_positions == {11}
def test_fragmentation_allows_single_anchor_side_ring():
    """A side ring attached through a single bond is reported at position 5."""
    molecule = build_macrolactone_with_single_anchor_side_ring()
    fragmentation = MacrolactoneFragmenter().fragment_molecule(molecule.smiles, parent_id="single_anchor_ring")
    observed_positions = {item.cleavage_position for item in fragmentation.fragments}
    assert observed_positions == {5}

View File

@@ -5,7 +5,7 @@ from macro_lactone_toolkit import MacrolactoneFragmenter
from macro_lactone_toolkit.splicing.engine import splice_molecule
from macro_lactone_toolkit.splicing.scaffold_prep import prepare_macrolactone_scaffold
from .helpers import build_macrolactone, canonicalize
from .helpers import build_macrolactone, build_macrolactone_with_fused_side_ring, canonicalize
def test_splice_benzene_methyl():
@@ -49,3 +49,14 @@ def test_prepare_scaffold_and_reassemble_fragment():
product = splice_molecule(scaffold, Chem.MolFromSmiles(fragment.fragment_smiles_labeled), position=5)
assert canonicalize(product) == canonicalize(built.mol)
def test_prepare_scaffold_rejects_position_without_single_anchor_fragment():
    """Scaffold preparation raises ``ValueError`` for position 5 of the fused-ring molecule."""
    fused = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    expected_message = "Position 5 does not contain a single-anchor fragmentable side chain"
    with pytest.raises(ValueError, match=expected_message):
        prepare_macrolactone_scaffold(fused.smiles, positions=[5], ring_size=16)

View File

@@ -0,0 +1,57 @@
from __future__ import annotations
import json
import sqlite3
import pandas as pd
from macro_lactone_toolkit.validation.validator import MacrolactoneValidator
from ..helpers import build_macrolactone_with_fused_side_ring
def test_validator_exports_only_single_anchor_fragments_and_fragment_library(tmp_path):
    """Run the validator on a fused-ring molecule and check every exported artifact.

    Only the methyl side chain at position 10 should appear in the database
    tables, ``summary.csv`` and ``fragment_library.csv``.
    """
    built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
    input_path = tmp_path / "input.csv"
    output_dir = tmp_path / "validation_output"
    pd.DataFrame(
        [
            {
                "ml_id": "ML00000001",
                "IDs": "CHEMBL0001",
                "smiles": built.smiles,
            }
        ]
    ).to_csv(input_path, index=False)

    validator = MacrolactoneValidator(output_dir=output_dir, sample_ratio=1.0)
    results = validator.run(input_path)
    assert results == {"total": 1, "success": 1, "failed": 0, "skipped": 0}

    # ``with sqlite3.connect(...)`` only wraps a transaction and leaves the
    # connection open, so close it explicitly once the rows are fetched.
    connection = sqlite3.connect(output_dir / "fragments.db")
    try:
        fragments = connection.execute(
            "SELECT cleavage_position, has_dummy_atom, dummy_atom_count FROM side_chain_fragments"
        ).fetchall()
        library_entries = connection.execute(
            """
            SELECT source_type, source_parent_ml_id, source_parent_chembl_id,
            cleavage_position, has_dummy_atom, dummy_atom_count, splice_ready
            FROM fragment_library_entries
            """
        ).fetchall()
    finally:
        connection.close()

    # SQLite returns the boolean columns as integers, hence the 1s below.
    assert fragments == [(10, 1, 1)]
    assert library_entries == [("validation_extract", "ML00000001", "CHEMBL0001", 10, 1, 1, 1)]

    summary = pd.read_csv(output_dir / "summary.csv")
    assert summary.loc[0, "num_sidechains"] == 1
    assert json.loads(summary.loc[0, "cleavage_positions"]) == [10]

    fragment_library = pd.read_csv(output_dir / "fragment_library.csv")
    assert fragment_library.loc[0, "source_type"] == "validation_extract"
    assert int(fragment_library.loc[0, "cleavage_position"]) == 10
    assert bool(fragment_library.loc[0, "has_dummy_atom"]) is True
    assert int(fragment_library.loc[0, "dummy_atom_count"]) == 1
    assert bool(fragment_library.loc[0, "splice_ready"]) is True