refactor(validation): use ml_id as primary ID, add chembl_id field
This commit is contained in:
@@ -50,8 +50,14 @@ def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--id-col",
|
"--id-col",
|
||||||
type=str,
|
type=str,
|
||||||
|
default="ml_id",
|
||||||
|
help="ID column name (default: ml_id)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--chembl-id-col",
|
||||||
|
type=str,
|
||||||
default="IDs",
|
default="IDs",
|
||||||
help="ID column name",
|
help="CHEMBL ID column name (default: IDs)",
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@@ -69,6 +75,7 @@ def main():
|
|||||||
sample_ratio=args.sample_ratio,
|
sample_ratio=args.sample_ratio,
|
||||||
smiles_col=args.smiles_col,
|
smiles_col=args.smiles_col,
|
||||||
id_col=args.id_col,
|
id_col=args.id_col,
|
||||||
|
chembl_id_col=args.chembl_id_col,
|
||||||
)
|
)
|
||||||
|
|
||||||
results = validator.run(args.input)
|
results = validator.run(args.input)
|
||||||
|
|||||||
@@ -27,7 +27,8 @@ class ParentMolecule(SQLModel, table=True):
|
|||||||
__tablename__ = "parent_molecules"
|
__tablename__ = "parent_molecules"
|
||||||
|
|
||||||
id: Optional[int] = Field(default=None, primary_key=True)
|
id: Optional[int] = Field(default=None, primary_key=True)
|
||||||
source_id: str = Field(index=True)
|
ml_id: str = Field(index=True) # MacrolactoneDB unique ID (e.g., ML00000001)
|
||||||
|
chembl_id: Optional[str] = Field(default=None, index=True) # Original CHEMBL ID
|
||||||
molecule_name: Optional[str] = None
|
molecule_name: Optional[str] = None
|
||||||
smiles: str = Field(index=True)
|
smiles: str = Field(index=True)
|
||||||
classification: str = Field(index=True)
|
classification: str = Field(index=True)
|
||||||
|
|||||||
@@ -41,12 +41,14 @@ class MacrolactoneValidator:
|
|||||||
output_dir: str | Path,
|
output_dir: str | Path,
|
||||||
sample_ratio: float = 0.1,
|
sample_ratio: float = 0.1,
|
||||||
smiles_col: str = "smiles",
|
smiles_col: str = "smiles",
|
||||||
id_col: str = "IDs",
|
id_col: str = "ml_id",
|
||||||
|
chembl_id_col: str = "IDs",
|
||||||
):
|
):
|
||||||
self.output_dir = Path(output_dir)
|
self.output_dir = Path(output_dir)
|
||||||
self.sample_ratio = sample_ratio
|
self.sample_ratio = sample_ratio
|
||||||
self.smiles_col = smiles_col
|
self.smiles_col = smiles_col
|
||||||
self.id_col = id_col
|
self.id_col = id_col
|
||||||
|
self.chembl_id_col = chembl_id_col
|
||||||
|
|
||||||
self.analyzer = MacroLactoneAnalyzer()
|
self.analyzer = MacroLactoneAnalyzer()
|
||||||
|
|
||||||
@@ -83,7 +85,8 @@ class MacrolactoneValidator:
|
|||||||
|
|
||||||
def _process_molecule(self, row: pd.Series) -> str:
|
def _process_molecule(self, row: pd.Series) -> str:
|
||||||
"""Process a single molecule. Returns status."""
|
"""Process a single molecule. Returns status."""
|
||||||
source_id = str(row[self.id_col])
|
ml_id = str(row[self.id_col])
|
||||||
|
chembl_id = str(row[self.chembl_id_col]) if self.chembl_id_col in row and pd.notna(row[self.chembl_id_col]) else None
|
||||||
smiles = row[self.smiles_col]
|
smiles = row[self.smiles_col]
|
||||||
name = row.get("molecule_pref_name", None)
|
name = row.get("molecule_pref_name", None)
|
||||||
|
|
||||||
@@ -105,7 +108,8 @@ class MacrolactoneValidator:
|
|||||||
|
|
||||||
# Create parent record
|
# Create parent record
|
||||||
parent = ParentMolecule(
|
parent = ParentMolecule(
|
||||||
source_id=source_id,
|
ml_id=ml_id,
|
||||||
|
chembl_id=chembl_id,
|
||||||
molecule_name=name,
|
molecule_name=name,
|
||||||
smiles=smiles,
|
smiles=smiles,
|
||||||
classification=classification,
|
classification=classification,
|
||||||
@@ -124,7 +128,7 @@ class MacrolactoneValidator:
|
|||||||
parent.processing_status = ProcessingStatus.SKIPPED
|
parent.processing_status = ProcessingStatus.SKIPPED
|
||||||
session.add(parent)
|
session.add(parent)
|
||||||
session.commit()
|
session.commit()
|
||||||
self._save_original_image(smiles, source_id, ring_size, classification)
|
self._save_original_image(smiles, ml_id, ring_size, classification)
|
||||||
return "skipped"
|
return "skipped"
|
||||||
|
|
||||||
# Process standard macrolactone
|
# Process standard macrolactone
|
||||||
@@ -172,7 +176,7 @@ class MacrolactoneValidator:
|
|||||||
|
|
||||||
# Save numbered image
|
# Save numbered image
|
||||||
paths = get_output_paths(
|
paths = get_output_paths(
|
||||||
self.output_dir, parent.source_id, parent.ring_size, "standard_macrolactone"
|
self.output_dir, parent.ml_id, parent.ring_size, "standard_macrolactone"
|
||||||
)
|
)
|
||||||
image_path = save_numbered_molecule(smiles, paths["numbered_image"], parent.ring_size)
|
image_path = save_numbered_molecule(smiles, paths["numbered_image"], parent.ring_size)
|
||||||
if image_path:
|
if image_path:
|
||||||
@@ -217,7 +221,7 @@ class MacrolactoneValidator:
|
|||||||
# Create fragment record
|
# Create fragment record
|
||||||
fragment = SideChainFragment(
|
fragment = SideChainFragment(
|
||||||
parent_id=parent.id,
|
parent_id=parent.id,
|
||||||
fragment_id=f"{parent.source_id}_frag_{fragment_idx}",
|
fragment_id=f"{parent.ml_id}_frag_{fragment_idx}",
|
||||||
cleavage_position=int(position),
|
cleavage_position=int(position),
|
||||||
attachment_atom_idx=ring_atom_idx,
|
attachment_atom_idx=ring_atom_idx,
|
||||||
attachment_atom_symbol=ring_atom.GetSymbol(),
|
attachment_atom_symbol=ring_atom.GetSymbol(),
|
||||||
@@ -235,7 +239,7 @@ class MacrolactoneValidator:
|
|||||||
|
|
||||||
# Save fragment images
|
# Save fragment images
|
||||||
if fragments and paths["sidechains_dir"]:
|
if fragments and paths["sidechains_dir"]:
|
||||||
image_paths = save_fragment_images(fragments, paths["sidechains_dir"], parent.source_id)
|
image_paths = save_fragment_images(fragments, paths["sidechains_dir"], parent.ml_id)
|
||||||
for frag, img_path in zip(fragments, image_paths):
|
for frag, img_path in zip(fragments, image_paths):
|
||||||
frag.image_path = img_path
|
frag.image_path = img_path
|
||||||
session.add(frag)
|
session.add(frag)
|
||||||
@@ -248,9 +252,9 @@ class MacrolactoneValidator:
|
|||||||
session.add(parent)
|
session.add(parent)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
def _save_original_image(self, smiles: str, source_id: str, ring_size: int, classification: str):
|
def _save_original_image(self, smiles: str, ml_id: str, ring_size: int, classification: str):
|
||||||
"""Save original image for non-standard molecules."""
|
"""Save original image for non-standard molecules."""
|
||||||
paths = get_output_paths(self.output_dir, source_id, ring_size, classification)
|
paths = get_output_paths(self.output_dir, ml_id, ring_size, classification)
|
||||||
try:
|
try:
|
||||||
from rdkit.Chem import Draw
|
from rdkit.Chem import Draw
|
||||||
|
|
||||||
@@ -325,7 +329,8 @@ Fragments use isotope values to mark cleavage position:
|
|||||||
|
|
||||||
### summary.csv
|
### summary.csv
|
||||||
|
|
||||||
- `source_id`: Original molecule ID from MacrolactoneDB
|
- `ml_id`: MacrolactoneDB unique ID (e.g., ML00000001)
|
||||||
|
- `chembl_id`: Original CHEMBL ID (if available)
|
||||||
- `classification`: Classification result
|
- `classification`: Classification result
|
||||||
- `ring_size`: Detected ring size (12-20)
|
- `ring_size`: Detected ring size (12-20)
|
||||||
- `num_sidechains`: Number of side chains detected
|
- `num_sidechains`: Number of side chains detected
|
||||||
@@ -363,7 +368,8 @@ sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY
|
|||||||
for p in parents:
|
for p in parents:
|
||||||
data.append({
|
data.append({
|
||||||
"id": p.id,
|
"id": p.id,
|
||||||
"source_id": p.source_id,
|
"ml_id": p.ml_id,
|
||||||
|
"chembl_id": p.chembl_id,
|
||||||
"molecule_name": p.molecule_name,
|
"molecule_name": p.molecule_name,
|
||||||
"smiles": p.smiles,
|
"smiles": p.smiles,
|
||||||
"classification": p.classification,
|
"classification": p.classification,
|
||||||
|
|||||||
85
validation_output/README.md
Normal file
85
validation_output/README.md
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
# MacrolactoneDB Validation Output
|
||||||
|
|
||||||
|
This directory contains validation results for the 12- to 20-membered rings in MacrolactoneDB.
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
validation_output/
|
||||||
|
├── README.md # This file
|
||||||
|
├── fragments.db # SQLite database with all data
|
||||||
|
├── summary.csv # Summary of all processed molecules
|
||||||
|
├── summary_statistics.json # Statistical summary
|
||||||
|
│
|
||||||
|
├── ring_size_12/ # 12-membered rings
|
||||||
|
├── ring_size_13/ # 13-membered rings
|
||||||
|
...
|
||||||
|
└── ring_size_20/ # 20-membered rings
|
||||||
|
├── molecules.csv # Molecules in this ring size
|
||||||
|
├── standard/ # Standard macrolactones
|
||||||
|
│ ├── numbered/ # Numbered ring images
|
||||||
|
│ │ └── {id}_numbered.png
|
||||||
|
│ └── sidechains/ # Fragment images
|
||||||
|
│ └── {id}/
|
||||||
|
│ └── {id}_frag_{n}_pos{pos}.png
|
||||||
|
├── non_standard/ # Non-standard macrocycles
|
||||||
|
│ └── original/
|
||||||
|
│ └── {id}_original.png
|
||||||
|
└── rejected/ # Not macrolactones
|
||||||
|
└── original/
|
||||||
|
└── {id}_original.png
|
||||||
|
```
|
||||||
|
|
||||||
|
## Database Schema
|
||||||
|
|
||||||
|
### Tables
|
||||||
|
|
||||||
|
- **parent_molecules**: Original molecule information
|
||||||
|
- **ring_numberings**: Ring atom numbering details
|
||||||
|
- **side_chain_fragments**: Fragmentation results with isotope tags
|
||||||
|
- **validation_results**: Manual validation records
|
||||||
|
|
||||||
|
### Key Fields
|
||||||
|
|
||||||
|
- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone
|
||||||
|
- `dummy_isotope`: Cleavage position, stored as the isotope value of the fragment's dummy atom so the position can be recovered during reassembly
|
||||||
|
- `cleavage_position`: Position on ring where side chain was attached
|
||||||
|
|
||||||
|
## Ring Numbering Convention
|
||||||
|
|
||||||
|
1. Position 1 = Lactone carbonyl carbon (C=O)
|
||||||
|
2. Position 2 = Ester oxygen (-O-)
|
||||||
|
3. Positions 3 through N = Remaining ring atoms, numbered sequentially around the ring
|
||||||
|
|
||||||
|
## Isotope Tagging
|
||||||
|
|
||||||
|
Fragments use isotope values to mark cleavage position:
|
||||||
|
- `[5*]CCO` = Fragment from position 5, dummy atom has isotope=5
|
||||||
|
- This enables precise reconstruction during reassembly
|
||||||
|
|
||||||
|
## CSV Columns
|
||||||
|
|
||||||
|
### summary.csv
|
||||||
|
|
||||||
|
- `source_id`: Original molecule ID from MacrolactoneDB
|
||||||
|
- `classification`: Classification result
|
||||||
|
- `ring_size`: Detected ring size (12-20)
|
||||||
|
- `num_sidechains`: Number of side chains detected
|
||||||
|
- `cleavage_positions`: JSON array of cleavage positions
|
||||||
|
- `processing_status`: pending | success | failed | skipped
|
||||||
|
|
||||||
|
## Querying the Database
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List tables
|
||||||
|
sqlite3 fragments.db ".tables"
|
||||||
|
|
||||||
|
# Get standard macrolactones with fragments
|
||||||
|
sqlite3 fragments.db "SELECT * FROM parent_molecules WHERE classification='standard_macrolactone' LIMIT 5;"
|
||||||
|
|
||||||
|
# Get fragments for a specific molecule
|
||||||
|
sqlite3 fragments.db "SELECT * FROM side_chain_fragments WHERE parent_id=1;"
|
||||||
|
|
||||||
|
# Count by ring size
|
||||||
|
sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY ring_size;"
|
||||||
|
```
|
||||||
BIN
validation_output/fragments.db
Normal file
BIN
validation_output/fragments.db
Normal file
Binary file not shown.
1098
validation_output/summary.csv
Normal file
1098
validation_output/summary.csv
Normal file
File diff suppressed because it is too large
Load Diff
24
validation_output/summary_statistics.json
Normal file
24
validation_output/summary_statistics.json
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"total_molecules": 1097,
|
||||||
|
"by_classification": {
|
||||||
|
"non_standard_macrocycle": 617,
|
||||||
|
"standard_macrolactone": 459,
|
||||||
|
"not_macrolactone": 21
|
||||||
|
},
|
||||||
|
"by_ring_size": {
|
||||||
|
"14.0": 301,
|
||||||
|
"16.0": 187,
|
||||||
|
"15.0": 161,
|
||||||
|
"12.0": 141,
|
||||||
|
"19.0": 85,
|
||||||
|
"18.0": 80,
|
||||||
|
"13.0": 67,
|
||||||
|
"20.0": 24,
|
||||||
|
"17.0": 19
|
||||||
|
},
|
||||||
|
"by_status": {
|
||||||
|
"skipped": 638,
|
||||||
|
"success": 367,
|
||||||
|
"failed": 92
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user