Files
macrolactone-toolkit/src/macro_lactone_toolkit/validation/models.py
lingyuzeng 46a438dd36 feat(validation): enforce single-anchor fragments
- skip fused/shared/multi-anchor side systems during extraction
- add fragment library schema and fragment_library.csv export
- make scaffold prep strict for non-spliceable positions
2026-03-19 14:20:32 +08:00

117 lines
4.0 KiB
Python

from __future__ import annotations
from datetime import UTC, datetime
from typing import List, Optional
from sqlalchemy.orm import Mapped, mapped_column, relationship
from sqlmodel import Field, SQLModel
class ClassificationType:
    """String constants for the classification labels used by this module.

    This is a plain namespace class rather than an ``Enum``: every attribute
    is an ordinary ``str`` and can be compared or stored directly.
    """

    STANDARD: str = "standard_macrolactone"
    NON_STANDARD: str = "non_standard_macrocycle"
    NOT_MACROLACTONE: str = "not_macrolactone"
class ProcessingStatus:
    """String constants for the processing-status labels.

    This is a plain namespace class rather than an ``Enum``: every attribute
    is an ordinary ``str``. ``PENDING`` is the value used as the default of
    ``ParentMolecule.processing_status``.
    """

    PENDING: str = "pending"
    SUCCESS: str = "success"
    FAILED: str = "failed"
    SKIPPED: str = "skipped"
# Define all tables without relationships first
class ParentMolecule(SQLModel, table=True):
    """Original molecule information.

    Rows are stored in the ``parent_molecules`` table. ``RingNumbering``,
    ``SideChainFragment`` and ``ValidationResult`` refer back to this table
    through their ``parent_id`` foreign keys.
    """

    __tablename__ = "parent_molecules"

    # --- Identity ---------------------------------------------------------
    # Surrogate primary key; ``None`` until a value is assigned.
    id: Optional[int] = Field(default=None, primary_key=True)
    # MacrolactoneDB unique ID (e.g., ML00000001).
    # NOTE(review): the column is indexed, but no unique constraint is
    # declared on it here.
    ml_id: str = Field(index=True)
    # Original CHEMBL ID.
    chembl_id: Optional[str] = Field(default=None, index=True)
    molecule_name: Optional[str] = None
    smiles: str = Field(index=True)

    # --- Classification ---------------------------------------------------
    # Presumably one of the ``ClassificationType`` strings; the model itself
    # does not restrict the value.
    classification: str = Field(index=True)
    ring_size: Optional[int] = Field(default=None, index=True)
    primary_reason_code: Optional[str] = None
    primary_reason_message: Optional[str] = None

    # --- Processing outcome -----------------------------------------------
    # Defaults to ``ProcessingStatus.PENDING``.
    processing_status: str = Field(default=ProcessingStatus.PENDING)
    error_message: Optional[str] = None
    num_sidechains: Optional[int] = None
    # Stored as text; the encoding of the positions is not defined in this
    # module (TODO confirm against the code that writes it).
    cleavage_positions: Optional[str] = None
    numbered_image_path: Optional[str] = None

    # --- Timestamps -------------------------------------------------------
    # Timezone-aware UTC time, produced by the factory whenever no explicit
    # value is supplied.
    created_at: datetime = Field(default_factory=lambda: datetime.now(tz=UTC))
    processed_at: Optional[datetime] = None
class RingNumbering(SQLModel, table=True):
    """Ring numbering details.

    Rows are stored in the ``ring_numberings`` table. ``parent_id`` is both a
    foreign key to ``parent_molecules.id`` and unique, so a parent molecule
    can have at most one numbering row.
    """

    __tablename__ = "ring_numberings"

    # Surrogate primary key; ``None`` until a value is assigned.
    id: Optional[int] = Field(default=None, primary_key=True)
    # Owning parent molecule (unique: one numbering per parent).
    parent_id: int = Field(foreign_key="parent_molecules.id", unique=True)

    ring_size: int
    carbonyl_carbon_idx: int
    ester_oxygen_idx: int

    # Both mappings are persisted as text. The serialization format is not
    # visible in this module (presumably an encoded mapping; TODO confirm
    # against the code that writes these columns).
    position_to_atom: str
    atom_to_position: str
class SideChainFragment(SQLModel, table=True):
    """Side chain fragments from cleavage.

    Rows are stored in the ``side_chain_fragments`` table and reference their
    source molecule through ``parent_id`` (foreign key to
    ``parent_molecules.id``).
    """

    __tablename__ = "side_chain_fragments"

    # --- Identity and provenance ------------------------------------------
    # Surrogate primary key; ``None`` until a value is assigned.
    id: Optional[int] = Field(default=None, primary_key=True)
    parent_id: int = Field(foreign_key="parent_molecules.id", index=True)
    fragment_id: str = Field(index=True)

    # --- Attachment point -------------------------------------------------
    cleavage_position: int = Field(index=True)
    attachment_atom_idx: int
    attachment_atom_symbol: str

    # --- Fragment structure -----------------------------------------------
    fragment_smiles_labeled: str
    fragment_smiles_plain: str
    dummy_isotope: int
    # A fragment defaults to carrying exactly one dummy atom.
    has_dummy_atom: bool = Field(default=True)
    dummy_atom_count: int = Field(default=1)

    # --- Descriptors ------------------------------------------------------
    atom_count: int
    heavy_atom_count: int
    molecular_weight: float
    original_bond_type: str

    # --- Rendering --------------------------------------------------------
    image_path: Optional[str] = None
class FragmentLibraryEntry(SQLModel, table=True):
    """Unified fragment library entries.

    Rows are stored in the ``fragment_library_entries`` table. The
    ``source_*`` columns are plain indexed values; none of them is declared
    as a foreign key.
    """

    __tablename__ = "fragment_library_entries"

    # Surrogate primary key; ``None`` until a value is assigned.
    id: Optional[int] = Field(default=None, primary_key=True)

    # --- Provenance -------------------------------------------------------
    source_type: str = Field(index=True)
    source_fragment_id: Optional[str] = Field(default=None, index=True)
    source_parent_ml_id: Optional[str] = Field(default=None, index=True)
    source_parent_chembl_id: Optional[str] = Field(default=None, index=True)
    cleavage_position: Optional[int] = Field(default=None, index=True)

    # --- Fragment structure -----------------------------------------------
    # Only the plain SMILES is required; the labeled form is optional.
    fragment_smiles_labeled: Optional[str] = None
    fragment_smiles_plain: str
    # An entry defaults to having no dummy atom and not being splice-ready.
    has_dummy_atom: bool = Field(default=False)
    dummy_atom_count: int = Field(default=0)
    splice_ready: bool = Field(default=False, index=True)
    original_bond_type: Optional[str] = None

    # --- Timestamps -------------------------------------------------------
    # Timezone-aware UTC time, produced by the factory whenever no explicit
    # value is supplied.
    created_at: datetime = Field(default_factory=lambda: datetime.now(tz=UTC))
class ValidationResult(SQLModel, table=True):
    """Manual validation records.

    Rows are stored in the ``validation_results`` table and reference the
    reviewed molecule through ``parent_id`` (foreign key to
    ``parent_molecules.id``).
    """

    __tablename__ = "validation_results"

    # Surrogate primary key; ``None`` until a value is assigned.
    id: Optional[int] = Field(default=None, primary_key=True)
    # NOTE(review): unlike ``SideChainFragment.parent_id``, this foreign key
    # is declared without an index or a unique constraint.
    parent_id: int = Field(foreign_key="parent_molecules.id")

    # --- Verdicts ---------------------------------------------------------
    # Each verdict is nullable and defaults to ``None`` (presumably meaning
    # "not assessed"; confirm with the reviewing workflow).
    numbering_correct: Optional[bool] = None
    cleavage_correct: Optional[bool] = None
    classification_correct: Optional[bool] = None

    # --- Review metadata --------------------------------------------------
    notes: Optional[str] = None
    validated_by: Optional[str] = None
    validated_at: Optional[datetime] = None