# MacrolactoneDB 12-20元环验证方案设计 ## 1. 验证目标与范围 ### 1.1 目标 - 验证 `macro_lactone_toolkit` 对 MacrolactoneDB 12-20元环大环内酯的识别准确性 - 验证环编号正确性(位置1=羰基碳,位置2=酯键氧) - 验证侧链断裂功能(仅针对标准大环内酯) - 使用同位素标记技术标记裂解位置,便于后续拼接 ### 1.2 范围 - **数据来源**: `/data/MacrolactoneDB/ring12_20/temp.csv` (11,037分子) - **抽样策略**: 分层随机抽样10% (~1,100分子),按环大小12-20均匀分布 - **处理对象**: 仅处理 `classification="standard_macrolactone"` 的分子 - **输出**: 可视化图片 + SQLite数据库 + 汇总统计 ## 2. 数据库设计 (SQLModel) ### 2.1 模型定义 ```python from typing import Optional, List from sqlmodel import SQLModel, Field, Relationship from datetime import datetime from enum import Enum class ClassificationType(str, Enum): STANDARD = "standard_macrolactone" NON_STANDARD = "non_standard_macrocycle" NOT_MACROLACTONE = "not_macrolactone" class ProcessingStatus(str, Enum): PENDING = "pending" SUCCESS = "success" FAILED = "failed" SKIPPED = "skipped" # ==================== 主分子表 ==================== class ParentMolecule(SQLModel, table=True): """原始分子信息表""" __tablename__ = "parent_molecules" id: Optional[int] = Field(default=None, primary_key=True) # 原始数据 source_id: str = Field(index=True) # 来自CSV的IDs字段 molecule_name: Optional[str] = None # molecule_pref_name smiles: str = Field(index=True) # 分类结果 classification: ClassificationType = Field(index=True) ring_size: Optional[int] = Field(default=None, index=True) primary_reason_code: Optional[str] = None primary_reason_message: Optional[str] = None # 处理状态 processing_status: ProcessingStatus = Field(default=ProcessingStatus.PENDING) error_message: Optional[str] = None # 元数据 created_at: datetime = Field(default_factory=datetime.utcnow) processed_at: Optional[datetime] = None # 关系 fragments: List["SideChainFragment"] = Relationship(back_populates="parent") numbering: Optional["RingNumbering"] = Relationship(back_populates="parent") # ==================== 环编号表 ==================== class RingNumbering(SQLModel, table=True): """环编号详细信息""" __tablename__ = "ring_numberings" id: Optional[int] = Field(default=None, primary_key=True) parent_id: int = Field(foreign_key="parent_molecules.id", unique=True) # 环基本信息 ring_size: int carbonyl_carbon_idx: int # 位置1 ester_oxygen_idx: int # 位置2 # 原子映射 (JSON存储) position_to_atom: str # JSON: {"1": 5, "2": 10, ...} atom_to_position: str # JSON: {"5": 1, "10": 2, ...} # 关系 parent: Optional[ParentMolecule] = Relationship(back_populates="numbering") # ==================== 侧链片段表 ==================== class SideChainFragment(SQLModel, table=True): """裂解产生的侧链片段""" __tablename__ = "side_chain_fragments" id: Optional[int] = Field(default=None, primary_key=True) parent_id: int = Field(foreign_key="parent_molecules.id", index=True) # 片段标识 fragment_id: str = Field(index=True) # {source_id}_frag_{n} # 裂解位置信息 cleavage_position: int = Field(index=True) # 环上的断裂位置 attachment_atom_idx: int # 母环上的连接原子索引 attachment_atom_symbol: str # C, O, N等 # SMILES (关键:同位素标记用于标识裂解位置) fragment_smiles_labeled: str # 带同位素标记,如 [5*]CCO fragment_smiles_plain: str # 无标记,如 *CCO # 同位素标记值 (用于后续拼接) dummy_isotope: int # 裂解位置的编号,用于重建连接关系 # 物理化学性质 atom_count: int heavy_atom_count: int molecular_weight: float # 连接信息 (用于后续拼接) original_bond_type: str # SINGLE, DOUBLE, AROMATIC等 # 关系 parent: Optional[ParentMolecule] = Relationship(back_populates="fragments") # ==================== 验证结果表 ==================== class ValidationResult(SQLModel, table=True): """人工验证结果记录""" __tablename__ = "validation_results" id: Optional[int] = Field(default=None, primary_key=True) parent_id: int = Field(foreign_key="parent_molecules.id") # 验证字段 numbering_correct: Optional[bool] = None # 编号是否正确 cleavage_correct: Optional[bool] = None # 裂解位置是否正确 classification_correct: Optional[bool] = None # 分类是否正确 # 备注 notes: Optional[str] = None validated_by: Optional[str] = None validated_at: Optional[datetime] = None ``` ### 2.2 数据库初始化 ```python from sqlmodel import create_engine, Session, SQLModel from contextlib import contextmanager # SQLite文件数据库 DATABASE_URL = "sqlite:///./validation_output/fragments.db" engine = create_engine(DATABASE_URL, echo=False) @contextmanager def get_session(): with Session(engine) as session: yield session def init_database(): """创建所有表""" SQLModel.metadata.create_all(engine) ``` ## 3. 同位素标记方案 (借鉴Molassembler) ### 3.1 标记策略 ```python def build_fragment_smiles_with_isotope( mol: Chem.Mol, side_chain_atoms: list[int], side_chain_start_idx: int, ring_atom_idx: int, cleavage_position: int, # 使用位置编号作为同位素值 ) -> str: """ 构建带同位素标记的片段SMILES 关键:用cleavage_position作为dummy原子的同位素值 这样后续拼接时能精确知道片段来自哪个位置 """ # 创建可编辑分子 emol = Chem.EditableMol(Chem.Mol(mol)) # 添加dummy原子替代连接点 dummy_atom = Chem.Atom(0) # 原子序数0 = dummy dummy_atom.SetIsotope(cleavage_position) # 【关键】用位置编号标记 dummy_idx = emol.AddAtom(dummy_atom) # 获取原始键类型 bond = mol.GetBondBetweenAtoms(ring_atom_idx, side_chain_start_idx) bond_type = bond.GetBondType() # 添加dummy原子与侧链起始原子的键 emol.AddBond(dummy_idx, side_chain_start_idx, bond_type) # 只保留dummy原子和侧链原子 atoms_to_keep = set([dummy_idx] + list(side_chain_atoms)) # 标记要删除的原子 for atom_idx in range(mol.GetNumAtoms()): if atom_idx not in atoms_to_keep: emol.RemoveAtom(atom_idx) fragment = emol.GetMol() Chem.SanitizeMol(fragment) return Chem.MolToSmiles(fragment) ``` ### 3.2 标记示例 | 裂解位置 | 原始结构 | 标记后SMILES | 说明 | |---------|---------|-------------|------| | 5 | `...C5(CCO)...` | `[5*]CCO` | dummy同位素=5,表示位置5的侧链 | | 12 | `...C12(CCCO)...` | `[12*]CCCO` | dummy同位素=12,表示位置12的侧链 | ### 3.3 拼接时使用标记 ```python # 后续拼接时,可以通过同位素值找到对应的裂解位置 def find_attachment_position(fragment_smiles: str) -> int: """从片段SMILES中提取裂解位置""" mol = Chem.MolFromSmiles(fragment_smiles) for atom in mol.GetAtoms(): if atom.GetAtomicNum() == 0 and atom.GetIsotope() > 0: return atom.GetIsotope() # 返回位置编号 return 0 ``` ## 4. 输出目录结构 ``` validation_output/ ├── README.md # 目录结构说明 ├── fragments.db # SQLite数据库 ├── summary.csv # 主汇总表 (所有分子) ├── summary_statistics.json # 统计信息 │ ├── ring_size_12/ # 12元环 ├── ring_size_13/ # 13元环 ├── ring_size_14/ # 14元环 ├── ring_size_15/ # 15元环 ├── ring_size_16/ # 16元环 ├── ring_size_17/ # 17元环 ├── ring_size_18/ # 18元环 ├── ring_size_19/ # 19元环 └── ring_size_20/ # 20元环 │ ├── molecules.csv # 该环大小的所有分子 │ ├── standard/ # 标准大环内酯 │ ├── numbered/ # 带编号的高亮环图 │ │ ├── {source_id}_numbered.png │ │ └── ... │ │ │ └── sidechains/ # 侧链片段图 │ └── {source_id}/ │ ├── {source_id}_frag_0_pos{pos}.png # 位置信息在文件名 │ ├── {source_id}_frag_1_pos{pos}.png │ └── ... │ ├── non_standard/ # 非标准大环 │ └── original/ │ ├── {source_id}_original.png │ └── ... │ └── rejected/ # 被拒绝的分子 └── original/ ├── {source_id}_original.png └── ... ``` ## 5. CSV字段设计 ### 5.1 summary.csv | 字段名 | 类型 | 说明 | |--------|------|------| | `id` | int | 数据库主键 | | `source_id` | str | 原始IDs字段 | | `molecule_name` | str | 分子名称 | | `smiles` | str | 原始SMILES | | `classification` | str | standard/non_standard/not_macrolactone | | `ring_size` | int | 检测到的环大小 | | `primary_reason_code` | str | 分类原因代码 | | `primary_reason_message` | str | 分类原因描述 | | `processing_status` | str | pending/success/failed/skipped | | `error_message` | str | 错误信息 | | `num_sidechains` | int | 侧链数量 | | `cleavage_positions` | str | JSON数组 [5, 8, 12] | | `numbered_image_path` | str | 编号图相对路径 | | `processed_at` | datetime | 处理时间 | ### 5.2 fragments.csv (每个分子的侧链详情) | 字段名 | 类型 | 说明 | |--------|------|------| | `fragment_id` | str | 唯一标识 | | `source_id` | str | 母分子ID | | `cleavage_position` | int | 环上断裂位置 | | `attachment_atom_idx` | int | 连接原子索引 | | `attachment_atom_symbol` | str | 连接原子类型 | | `fragment_smiles_labeled` | str | 带同位素标记的SMILES | | `fragment_smiles_plain` | str | 无标记SMILES | | `dummy_isotope` | int | 标记值=裂解位置 | | `atom_count` | int | 原子数 | | `molecular_weight` | float | 分子量 | | `original_bond_type` | str | 原始键类型 | | `image_path` | str | 片段图路径 | ## 6. 验证脚本架构 ```python #!/usr/bin/env python3 """ MacrolactoneDB 12-20元环验证脚本 使用: pixi run python scripts/validate_macrolactone_db.py """ import pandas as pd from sqlmodel import Session from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter from macro_lactone_toolkit.visualization import save_numbered_molecule_png class MacrolactoneValidator: def __init__(self, sample_ratio=0.1, output_dir="validation_output"): self.analyzer = MacroLactoneAnalyzer() self.fragmenter = MacrolactoneFragmenter() self.sample_ratio = sample_ratio self.output_dir = Path(output_dir) def run(self, input_csv: str): # 1. 加载数据 df = pd.read_csv(input_csv) # 2. 分层抽样 sampled = self._stratified_sample(df) # 3. 初始化数据库 init_database() # 4. 处理每个分子 for _, row in sampled.iterrows(): self._process_molecule(row) # 5. 生成汇总 self._generate_summary() def _stratified_sample(self, df: pd.DataFrame) -> pd.DataFrame: """按环大小分层抽样""" # 先分类所有分子 df['classification'] = df['smiles'].apply( lambda s: self.analyzer.classify_macrocycle(s).classification ) df['ring_size'] = df['smiles'].apply( lambda s: self.analyzer.classify_macrocycle(s).ring_size ) # 按ring_size分层,每层抽10% sampled = df.groupby('ring_size').apply( lambda x: x.sample(frac=self.sample_ratio, random_state=42) ).reset_index(drop=True) return sampled def _process_molecule(self, row: pd.Series): """处理单个分子""" source_id = row['IDs'] smiles = row['smiles'] classification = row['classification'] ring_size = row['ring_size'] # 保存到ParentMolecule表 parent = ParentMolecule( source_id=source_id, molecule_name=row.get('molecule_pref_name'), smiles=smiles, classification=classification, ring_size=ring_size, ) if classification != ClassificationType.STANDARD: parent.processing_status = ProcessingStatus.SKIPPED self._save_image(smiles, ring_size, source_id, classification) return # 处理标准大环内酯 try: # 环编号 numbering = self.fragmenter.number_molecule(smiles) self._save_numbering_to_db(parent.id, numbering) # 生成编号图 self._save_numbered_image(smiles, ring_size, source_id) # 侧链断裂 result = self.fragmenter.fragment_molecule(smiles, parent_id=source_id) self._save_fragments_to_db(parent.id, result) self._save_fragment_images(result, source_id) parent.processing_status = ProcessingStatus.SUCCESS parent.num_sidechains = len(result.fragments) parent.cleavage_positions = json.dumps([f.cleavage_position for f in result.fragments]) except Exception as e: parent.processing_status = ProcessingStatus.FAILED parent.error_message = str(e) parent.processed_at = datetime.utcnow() with get_session() as session: session.add(parent) session.commit() ``` ## 7. 执行命令 ```bash # 进入项目目录 cd /Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split # 激活pixi环境 pixi shell # 运行验证脚本 python scripts/validate_macrolactone_db.py \ --input data/MacrolactoneDB/ring12_20/temp.csv \ --output validation_output \ --sample-ratio 0.1 # 查看数据库 sqlite3 validation_output/fragments.db ".tables" sqlite3 validation_output/fragments.db "SELECT * FROM parent_molecules LIMIT 5;" ``` ## 8. 人工检查清单 ### 8.1 编号正确性检查 - [ ] 位置1是否为内酯羰基碳(C=O) - [ ] 位置2是否为酯键氧(-O-) - [ ] 编号是否沿统一方向连续 - [ ] 桥环/非标准环是否被正确跳过 ### 8.2 裂解正确性检查 - [ ] 位置1和2是否有侧链(应该没有,是内酯本身) - [ ] 位置3-N的侧链是否正确识别 - [ ] dummy原子是否正确标记裂解位置 - [ ] 键型是否保持(单键/双键) ### 8.3 分类准确性检查 - [ ] 标准大环内酯是否被正确识别 - [ ] 非标准大环是否被正确分类并跳过 - [ ] 非大环内酯是否被拒绝 ## 9. 后续拼接接口设计 ```python def get_fragment_by_position(db_path: str, source_id: str, position: int) -> Optional[SideChainFragment]: """通过位置获取片段,用于后续拼接""" engine = create_engine(f"sqlite:///{db_path}") with Session(engine) as session: statement = select(SideChainFragment).where( SideChainFragment.parent_id == source_id, SideChainFragment.cleavage_position == position ) return session.exec(statement).first() def get_all_positions_for_parent(db_path: str, source_id: str) -> List[int]: """获取某分子的所有裂解位置""" engine = create_engine(f"sqlite:///{db_path}") with Session(engine) as session: statement = select(SideChainFragment.cleavage_position).where( SideChainFragment.parent_id == source_id ) return [r[0] for r in session.exec(statement).all()] ```