Compare commits
6 Commits
360cbc487e
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 8071a141ee | |||
| f6bf9e85a3 | |||
| 46a438dd36 | |||
| 07ba27be2b | |||
| f43f0520ce | |||
| bb42044faf |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -68,3 +68,4 @@ data/
|
|||||||
output/
|
output/
|
||||||
site/
|
site/
|
||||||
# docs/ source files should be tracked, only ignore generated site/
|
# docs/ source files should be tracked, only ignore generated site/
|
||||||
|
validation_output/
|
||||||
|
|||||||
90
README.md
90
README.md
@@ -62,6 +62,96 @@ pixi run macro-lactone-toolkit fragment \
|
|||||||
|
|
||||||
默认读取 `smiles` 列;若存在 `id` 列则将其作为 `parent_id`,否则自动生成 `row_<index>`。
|
默认读取 `smiles` 列;若存在 `id` 列则将其作为 `parent_id`,否则自动生成 `row_<index>`。
|
||||||
|
|
||||||
|
## MacrolactoneDB 验证模块
|
||||||
|
|
||||||
|
用于对 MacrolactoneDB 数据库进行抽样验证、分类、侧链断裂和数据库存储。
|
||||||
|
|
||||||
|
### 验证脚本使用
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 基本使用(10% 分层抽样)
|
||||||
|
pixi run python scripts/validate_macrolactone_db.py \
|
||||||
|
--input data/MacrolactoneDB/ring12_20/temp.csv \
|
||||||
|
--output validation_output \
|
||||||
|
--sample-ratio 0.1
|
||||||
|
|
||||||
|
# 处理全量数据
|
||||||
|
pixi run python scripts/validate_macrolactone_db.py \
|
||||||
|
--input data/MacrolactoneDB/ring12_20/temp.csv \
|
||||||
|
--output validation_output \
|
||||||
|
--sample-ratio 1.0
|
||||||
|
|
||||||
|
# 指定列名(如果 CSV 列名不同)
|
||||||
|
pixi run python scripts/validate_macrolactone_db.py \
|
||||||
|
--input data.csv \
|
||||||
|
--output validation_output \
|
||||||
|
--id-col ml_id \
|
||||||
|
--chembl-id-col IDs \
|
||||||
|
--smiles-col smiles
|
||||||
|
```
|
||||||
|
|
||||||
|
### 输出结构
|
||||||
|
|
||||||
|
```
|
||||||
|
validation_output/
|
||||||
|
├── README.md # 目录说明
|
||||||
|
├── fragments.db # SQLite 数据库
|
||||||
|
├── fragment_library.csv # 最终片段库导出(含 has_dummy_atom / splice_ready)
|
||||||
|
├── summary.csv # 汇总表(含 ml_id, chembl_id)
|
||||||
|
├── summary_statistics.json # 统计信息
|
||||||
|
├── ring_size_12/ # 按环大小组织
|
||||||
|
├── ring_size_13/
|
||||||
|
...
|
||||||
|
└── ring_size_20/
|
||||||
|
├── standard/
|
||||||
|
│ ├── numbered/ # 带编号的环图(文件名使用 ml_id)
|
||||||
|
│ │ └── {ml_id}_numbered.png
|
||||||
|
│ └── sidechains/ # 片段图
|
||||||
|
│ └── {ml_id}/
|
||||||
|
│ └── {ml_id}_frag_{n}_pos{pos}.png
|
||||||
|
├── non_standard/original/
|
||||||
|
└── rejected/original/
|
||||||
|
```
|
||||||
|
|
||||||
|
### 数据库查询示例
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 查看表结构
|
||||||
|
sqlite3 validation_output/fragments.db ".tables"
|
||||||
|
|
||||||
|
# 查询标准大环内酯
|
||||||
|
sqlite3 validation_output/fragments.db \
|
||||||
|
"SELECT ml_id, chembl_id, ring_size, num_sidechains \
|
||||||
|
FROM parent_molecules \
|
||||||
|
WHERE classification='standard_macrolactone' LIMIT 5;"
|
||||||
|
|
||||||
|
# 查询最终片段库
|
||||||
|
sqlite3 validation_output/fragments.db \
|
||||||
|
"SELECT source_type, source_parent_ml_id, cleavage_position, has_dummy_atom, splice_ready \
|
||||||
|
FROM fragment_library_entries LIMIT 10;"
|
||||||
|
|
||||||
|
# 查询片段
|
||||||
|
sqlite3 validation_output/fragments.db \
|
||||||
|
"SELECT fragment_id, cleavage_position, dummy_isotope, has_dummy_atom, dummy_atom_count \
|
||||||
|
FROM side_chain_fragments LIMIT 10;"
|
||||||
|
|
||||||
|
# 按环大小统计
|
||||||
|
sqlite3 validation_output/fragments.db \
|
||||||
|
"SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY ring_size;"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 关键字段说明
|
||||||
|
|
||||||
|
| 字段 | 说明 |
|
||||||
|
|------|------|
|
||||||
|
| `ml_id` | MacrolactoneDB 唯一 ID(如 ML00000001),用于文件命名 |
|
||||||
|
| `chembl_id` | 原始 CHEMBL ID(如 CHEMBL94657),可能为空 |
|
||||||
|
| `classification` | standard_macrolactone / non_standard_macrocycle / not_macrolactone |
|
||||||
|
| `dummy_isotope` | 裂解位置编号,用于片段重建 |
|
||||||
|
| `cleavage_position` | 环上的断裂位置 |
|
||||||
|
| `has_dummy_atom` | 该片段是否带 dummy 原子,可用于区分可直接拼接片段 |
|
||||||
|
| `splice_ready` | 是否与当前单锚点拼接流程直接兼容 |
|
||||||
|
|
||||||
## Legacy Scripts
|
## Legacy Scripts
|
||||||
|
|
||||||
`scripts/` 目录保留为薄封装或迁移提示,不再承载核心实现。正式接口以 `macro_lactone_toolkit.*` 与 `macro-lactone-toolkit` CLI 为准。
|
`scripts/` 目录保留为薄封装或迁移提示,不再承载核心实现。正式接口以 `macro_lactone_toolkit.*` 与 `macro-lactone-toolkit` CLI 为准。
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ rdkit = ">=2025.9.1,<2026"
|
|||||||
pandas = ">=2.3.3,<3"
|
pandas = ">=2.3.3,<3"
|
||||||
numpy = ">=2.3.4,<3"
|
numpy = ">=2.3.4,<3"
|
||||||
matplotlib = ">=3.10,<4"
|
matplotlib = ">=3.10,<4"
|
||||||
|
seaborn = ">=0.13,<0.14"
|
||||||
sqlmodel = ">=0.0.37,<0.0.38"
|
sqlmodel = ">=0.0.37,<0.0.38"
|
||||||
|
|
||||||
[pypi-dependencies]
|
[pypi-dependencies]
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ dependencies = [
|
|||||||
"matplotlib>=3.10",
|
"matplotlib>=3.10",
|
||||||
"numpy>=1.26",
|
"numpy>=1.26",
|
||||||
"pandas>=2.2",
|
"pandas>=2.2",
|
||||||
|
"seaborn>=0.13",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
|
|||||||
105
scripts/analyze_fragment_atom_counts.py
Normal file
105
scripts/analyze_fragment_atom_counts.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
分析 fragment_library.csv 中碎片的原子数分布。
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from rdkit import Chem
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def count_atoms(smiles):
|
||||||
|
"""计算 SMILES 中的原子数(不包括 dummy atom)"""
|
||||||
|
try:
|
||||||
|
mol = Chem.MolFromSmiles(smiles)
|
||||||
|
if mol is None:
|
||||||
|
return None
|
||||||
|
# 计算所有原子,但排除 dummy atom(原子序数为 0)
|
||||||
|
atom_count = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() != 0)
|
||||||
|
return atom_count
|
||||||
|
except:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# 读取 CSV 文件
|
||||||
|
csv_path = "validation_output/fragment_library.csv"
|
||||||
|
if not os.path.exists(csv_path):
|
||||||
|
print(f"文件不存在: {csv_path}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
df = pd.read_csv(csv_path)
|
||||||
|
print(f"总碎片数: {len(df)}")
|
||||||
|
|
||||||
|
# 计算原子数
|
||||||
|
df["atom_count"] = df["fragment_smiles_plain"].apply(count_atoms)
|
||||||
|
|
||||||
|
# 检查是否有无效的 SMILES
|
||||||
|
invalid_count = df["atom_count"].isna().sum()
|
||||||
|
if invalid_count > 0:
|
||||||
|
print(f"无效 SMILES 数量: {invalid_count}")
|
||||||
|
|
||||||
|
# 过滤掉无效的 SMILES
|
||||||
|
df_valid = df.dropna(subset=["atom_count"])
|
||||||
|
print(f"有效碎片数: {len(df_valid)}")
|
||||||
|
|
||||||
|
# 统计描述
|
||||||
|
print("\n原子数分布统计:")
|
||||||
|
print(df_valid["atom_count"].describe())
|
||||||
|
|
||||||
|
# 绘制分布图
|
||||||
|
plt.figure(figsize=(10, 6))
|
||||||
|
sns.histplot(
|
||||||
|
df_valid["atom_count"],
|
||||||
|
bins=range(0, int(df_valid["atom_count"].max()) + 2),
|
||||||
|
kde=False,
|
||||||
|
)
|
||||||
|
plt.title("Fragment Atom Count Distribution")
|
||||||
|
plt.xlabel("Atom Count")
|
||||||
|
plt.ylabel("Frequency")
|
||||||
|
plt.grid(True, alpha=0.3)
|
||||||
|
|
||||||
|
# 保存图片
|
||||||
|
output_dir = "validation_output"
|
||||||
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
plot_path = os.path.join(output_dir, "fragment_atom_count_distribution.png")
|
||||||
|
plt.savefig(plot_path, dpi=300, bbox_inches="tight")
|
||||||
|
print(f"\n分布图已保存至: {plot_path}")
|
||||||
|
|
||||||
|
# 显示图片(如果在交互式环境中)
|
||||||
|
try:
|
||||||
|
plt.show()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 分析不同原子数的碎片数量
|
||||||
|
atom_count_stats = df_valid["atom_count"].value_counts().sort_index()
|
||||||
|
print("\n不同原子数的碎片数量:")
|
||||||
|
for atom_count, count in atom_count_stats.items():
|
||||||
|
print(f" 原子数 {atom_count}: {count} 个碎片")
|
||||||
|
|
||||||
|
# 计算累积百分比
|
||||||
|
total_fragments = len(df_valid)
|
||||||
|
cumulative = 0
|
||||||
|
print("\n累积分布:")
|
||||||
|
for atom_count, count in atom_count_stats.items():
|
||||||
|
cumulative += count
|
||||||
|
percentage = (cumulative / total_fragments) * 100
|
||||||
|
print(f" 原子数 <= {atom_count}: {cumulative} 个碎片 ({percentage:.2f}%)")
|
||||||
|
|
||||||
|
# 建议筛选标准
|
||||||
|
print("\n建议筛选标准:")
|
||||||
|
# 例如,过滤掉原子数小于 3 的碎片
|
||||||
|
min_atom_count = 3
|
||||||
|
filtered_count = len(df_valid[df_valid["atom_count"] >= min_atom_count])
|
||||||
|
filtered_percentage = (filtered_count / total_fragments) * 100
|
||||||
|
print(
|
||||||
|
f" 如果过滤掉原子数 < {min_atom_count} 的碎片,剩余 {filtered_count} 个碎片 ({filtered_percentage:.2f}%)"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
986
scripts/analyze_validation_fragment_library.py
Normal file
986
scripts/analyze_validation_fragment_library.py
Normal file
@@ -0,0 +1,986 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from math import ceil
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import matplotlib
|
||||||
|
|
||||||
|
matplotlib.use("Agg")
|
||||||
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
from rdkit import Chem
|
||||||
|
from rdkit.Chem import rdMolDescriptors
|
||||||
|
|
||||||
|
from macro_lactone_toolkit.validation.fragment_library_analysis import (
|
||||||
|
build_filter_candidate_table,
|
||||||
|
build_fragment_atom_count_frequency_table,
|
||||||
|
build_fragment_atom_count_summary,
|
||||||
|
build_position_diversity_table,
|
||||||
|
load_fragment_library_dataset,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def build_parser() -> argparse.ArgumentParser:
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Analyze validation_output fragment_library.csv and generate atom-count and diversity reports."
|
||||||
|
)
|
||||||
|
parser.add_argument("--input", required=True, help="Path to validation_output/fragment_library.csv")
|
||||||
|
parser.add_argument("--db", required=True, help="Path to validation_output/fragments.db")
|
||||||
|
parser.add_argument("--output-dir", required=True, help="Directory to write plots and CSV summaries")
|
||||||
|
parser.add_argument("--ring-size", type=int, default=16, help="Ring size to analyze position-wise diversity for")
|
||||||
|
parser.add_argument(
|
||||||
|
"--design-min-atoms",
|
||||||
|
type=int,
|
||||||
|
default=4,
|
||||||
|
help="Minimum heavy-atom count to retain for design-oriented side-chain analysis",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--medchem-positions",
|
||||||
|
default="6,7,15,16",
|
||||||
|
help="Comma-separated cleavage positions to highlight as literature or medicinal-chemistry hotspots",
|
||||||
|
)
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def plot_fragment_atom_count_distribution(dataframe: pd.DataFrame, output_path: Path) -> None:
|
||||||
|
figure, axes = plt.subplots(1, 2, figsize=(14, 5))
|
||||||
|
|
||||||
|
sns.histplot(data=dataframe, x="fragment_atom_count", discrete=True, ax=axes[0], color="#3b82f6")
|
||||||
|
axes[0].set_title("Fragment Atom Count Distribution")
|
||||||
|
axes[0].set_xlabel("Fragment atom count")
|
||||||
|
axes[0].set_ylabel("Fragment rows")
|
||||||
|
|
||||||
|
sns.ecdfplot(data=dataframe, x="fragment_atom_count", ax=axes[1], color="#ef4444")
|
||||||
|
axes[1].set_title("Fragment Atom Count ECDF")
|
||||||
|
axes[1].set_xlabel("Fragment atom count")
|
||||||
|
axes[1].set_ylabel("Cumulative fraction")
|
||||||
|
|
||||||
|
figure.tight_layout()
|
||||||
|
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||||||
|
plt.close(figure)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_ring_position_atom_count_distribution(dataframe: pd.DataFrame, ring_size: int, output_path: Path) -> None:
|
||||||
|
positions = sorted(dataframe["cleavage_position"].unique().tolist())
|
||||||
|
ncols = 4
|
||||||
|
nrows = ceil(len(positions) / ncols)
|
||||||
|
figure, axes = plt.subplots(nrows, ncols, figsize=(16, 3.8 * nrows), sharex=True, sharey=True)
|
||||||
|
axes_flat = axes.flatten()
|
||||||
|
|
||||||
|
for axis, position in zip(axes_flat, positions, strict=False):
|
||||||
|
subset = dataframe[dataframe["cleavage_position"] == position]
|
||||||
|
sns.histplot(data=subset, x="fragment_atom_count", discrete=True, ax=axis, color="#10b981")
|
||||||
|
axis.set_title(f"Position {position} (n={len(subset)})")
|
||||||
|
axis.set_xlabel("Atom count")
|
||||||
|
axis.set_ylabel("Count")
|
||||||
|
|
||||||
|
for axis in axes_flat[len(positions) :]:
|
||||||
|
axis.axis("off")
|
||||||
|
|
||||||
|
figure.suptitle(f"{ring_size}-Membered Ring: Atom Count Distribution by Cleavage Position", y=1.02)
|
||||||
|
figure.tight_layout()
|
||||||
|
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||||||
|
plt.close(figure)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_ring_position_diversity(
|
||||||
|
diversity_table: pd.DataFrame,
|
||||||
|
ring_size: int,
|
||||||
|
output_path: Path,
|
||||||
|
title_suffix: str = "Side-Chain Diversity by Position",
|
||||||
|
) -> None:
|
||||||
|
if diversity_table.empty:
|
||||||
|
figure, axis = plt.subplots(figsize=(10, 4))
|
||||||
|
axis.text(0.5, 0.5, "No fragments available after filtering.", ha="center", va="center")
|
||||||
|
axis.set_axis_off()
|
||||||
|
figure.suptitle(f"{ring_size}-Membered Ring: {title_suffix}", y=0.98)
|
||||||
|
figure.tight_layout()
|
||||||
|
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||||||
|
plt.close(figure)
|
||||||
|
return
|
||||||
|
|
||||||
|
figure, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=True)
|
||||||
|
|
||||||
|
sns.barplot(data=diversity_table, x="cleavage_position", y="unique_fragments", ax=axes[0], color="#2563eb")
|
||||||
|
axes[0].set_title("Unique Fragment Types by Position")
|
||||||
|
axes[0].set_ylabel("Unique fragments")
|
||||||
|
|
||||||
|
sns.barplot(
|
||||||
|
data=diversity_table,
|
||||||
|
x="cleavage_position",
|
||||||
|
y="normalized_shannon_entropy",
|
||||||
|
ax=axes[1],
|
||||||
|
color="#f59e0b",
|
||||||
|
)
|
||||||
|
axes[1].set_title("Normalized Shannon Entropy by Position")
|
||||||
|
axes[1].set_ylabel("Normalized entropy")
|
||||||
|
|
||||||
|
sns.barplot(
|
||||||
|
data=diversity_table,
|
||||||
|
x="cleavage_position",
|
||||||
|
y="mean_pairwise_tanimoto_distance",
|
||||||
|
ax=axes[2],
|
||||||
|
color="#7c3aed",
|
||||||
|
)
|
||||||
|
axes[2].set_title("Mean Pairwise Tanimoto Distance by Position")
|
||||||
|
axes[2].set_ylabel("Mean distance")
|
||||||
|
axes[2].set_xlabel("Cleavage position")
|
||||||
|
|
||||||
|
figure.suptitle(f"{ring_size}-Membered Ring: {title_suffix}", y=1.01)
|
||||||
|
figure.tight_layout()
|
||||||
|
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||||||
|
plt.close(figure)
|
||||||
|
|
||||||
|
|
||||||
|
def build_position_count_comparison_table(all_dataframe: pd.DataFrame, filtered_dataframe: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
all_stats = (
|
||||||
|
all_dataframe.groupby("cleavage_position")
|
||||||
|
.agg(
|
||||||
|
total_fragments_all=("fragment_smiles_plain", "size"),
|
||||||
|
unique_fragments_all=("fragment_smiles_plain", "nunique"),
|
||||||
|
mean_atom_count_all=("fragment_atom_count", "mean"),
|
||||||
|
)
|
||||||
|
.reset_index()
|
||||||
|
)
|
||||||
|
filtered_stats = (
|
||||||
|
filtered_dataframe.groupby("cleavage_position")
|
||||||
|
.agg(
|
||||||
|
total_fragments_gt3=("fragment_smiles_plain", "size"),
|
||||||
|
unique_fragments_gt3=("fragment_smiles_plain", "nunique"),
|
||||||
|
mean_atom_count_gt3=("fragment_atom_count", "mean"),
|
||||||
|
)
|
||||||
|
.reset_index()
|
||||||
|
)
|
||||||
|
|
||||||
|
merged = all_stats.merge(filtered_stats, on="cleavage_position", how="left")
|
||||||
|
integer_columns = [
|
||||||
|
"total_fragments_gt3",
|
||||||
|
"unique_fragments_gt3",
|
||||||
|
]
|
||||||
|
for column in integer_columns:
|
||||||
|
merged[column] = merged[column].fillna(0).astype(int)
|
||||||
|
merged["mean_atom_count_gt3"] = merged["mean_atom_count_gt3"].fillna(0.0)
|
||||||
|
merged["gt3_row_fraction"] = merged["total_fragments_gt3"] / merged["total_fragments_all"]
|
||||||
|
merged["gt3_unique_fraction"] = merged["unique_fragments_gt3"] / merged["unique_fragments_all"]
|
||||||
|
return merged.sort_values("cleavage_position").reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def annotate_fragment_ring_counts(dataframe: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
result = dataframe.copy()
|
||||||
|
if result.empty:
|
||||||
|
result["fragment_ring_count"] = pd.Series(dtype=int)
|
||||||
|
return result
|
||||||
|
result["fragment_ring_count"] = result["fragment_smiles_plain"].apply(
|
||||||
|
lambda smiles: rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(smiles))
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def build_ring_sensitivity_table(filtered_dataframe: pd.DataFrame, position_counts: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
rows: list[dict[str, float | int]] = []
|
||||||
|
annotated = annotate_fragment_ring_counts(filtered_dataframe)
|
||||||
|
|
||||||
|
for position in position_counts["cleavage_position"]:
|
||||||
|
subset = annotated[annotated["cleavage_position"] == position].copy()
|
||||||
|
if subset.empty:
|
||||||
|
rows.append(
|
||||||
|
{
|
||||||
|
"cleavage_position": int(position),
|
||||||
|
"total_fragments_gt3": 0,
|
||||||
|
"cyclic_fragment_rows": 0,
|
||||||
|
"acyclic_fragment_rows": 0,
|
||||||
|
"cyclic_row_fraction": 0.0,
|
||||||
|
"acyclic_row_fraction": 0.0,
|
||||||
|
"unique_fragments_gt3": 0,
|
||||||
|
"cyclic_unique_fragments": 0,
|
||||||
|
"acyclic_unique_fragments": 0,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
|
unique = subset[["fragment_smiles_plain", "fragment_ring_count"]].drop_duplicates()
|
||||||
|
cyclic_rows = int((subset["fragment_ring_count"] > 0).sum())
|
||||||
|
acyclic_rows = int((subset["fragment_ring_count"] == 0).sum())
|
||||||
|
rows.append(
|
||||||
|
{
|
||||||
|
"cleavage_position": int(position),
|
||||||
|
"total_fragments_gt3": int(len(subset)),
|
||||||
|
"cyclic_fragment_rows": cyclic_rows,
|
||||||
|
"acyclic_fragment_rows": acyclic_rows,
|
||||||
|
"cyclic_row_fraction": cyclic_rows / len(subset),
|
||||||
|
"acyclic_row_fraction": acyclic_rows / len(subset),
|
||||||
|
"unique_fragments_gt3": int(subset["fragment_smiles_plain"].nunique()),
|
||||||
|
"cyclic_unique_fragments": int((unique["fragment_ring_count"] > 0).sum()),
|
||||||
|
"acyclic_unique_fragments": int((unique["fragment_ring_count"] == 0).sum()),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return pd.DataFrame(rows).sort_values("cleavage_position").reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def build_medchem_hotspot_table(
|
||||||
|
medchem_positions: list[int],
|
||||||
|
position_counts: pd.DataFrame,
|
||||||
|
diversity_all: pd.DataFrame,
|
||||||
|
diversity_gt3: pd.DataFrame,
|
||||||
|
) -> pd.DataFrame:
|
||||||
|
frame = pd.DataFrame({"cleavage_position": medchem_positions})
|
||||||
|
merged = frame.merge(position_counts, on="cleavage_position", how="left")
|
||||||
|
merged = merged.merge(
|
||||||
|
diversity_all[
|
||||||
|
[
|
||||||
|
"cleavage_position",
|
||||||
|
"normalized_shannon_entropy",
|
||||||
|
"mean_pairwise_tanimoto_distance",
|
||||||
|
]
|
||||||
|
].rename(
|
||||||
|
columns={
|
||||||
|
"normalized_shannon_entropy": "normalized_shannon_entropy_all",
|
||||||
|
"mean_pairwise_tanimoto_distance": "mean_pairwise_tanimoto_distance_all",
|
||||||
|
}
|
||||||
|
),
|
||||||
|
on="cleavage_position",
|
||||||
|
how="left",
|
||||||
|
)
|
||||||
|
merged = merged.merge(
|
||||||
|
diversity_gt3[
|
||||||
|
[
|
||||||
|
"cleavage_position",
|
||||||
|
"normalized_shannon_entropy",
|
||||||
|
"mean_pairwise_tanimoto_distance",
|
||||||
|
]
|
||||||
|
].rename(
|
||||||
|
columns={
|
||||||
|
"normalized_shannon_entropy": "normalized_shannon_entropy_gt3",
|
||||||
|
"mean_pairwise_tanimoto_distance": "mean_pairwise_tanimoto_distance_gt3",
|
||||||
|
}
|
||||||
|
),
|
||||||
|
on="cleavage_position",
|
||||||
|
how="left",
|
||||||
|
)
|
||||||
|
numeric_fill_zero = [
|
||||||
|
"total_fragments_all",
|
||||||
|
"unique_fragments_all",
|
||||||
|
"mean_atom_count_all",
|
||||||
|
"total_fragments_gt3",
|
||||||
|
"unique_fragments_gt3",
|
||||||
|
"mean_atom_count_gt3",
|
||||||
|
"gt3_row_fraction",
|
||||||
|
"gt3_unique_fraction",
|
||||||
|
"normalized_shannon_entropy_all",
|
||||||
|
"mean_pairwise_tanimoto_distance_all",
|
||||||
|
"normalized_shannon_entropy_gt3",
|
||||||
|
"mean_pairwise_tanimoto_distance_gt3",
|
||||||
|
]
|
||||||
|
for column in numeric_fill_zero:
|
||||||
|
merged[column] = merged[column].fillna(0.0)
|
||||||
|
|
||||||
|
integer_columns = [
|
||||||
|
"total_fragments_all",
|
||||||
|
"unique_fragments_all",
|
||||||
|
"total_fragments_gt3",
|
||||||
|
"unique_fragments_gt3",
|
||||||
|
]
|
||||||
|
for column in integer_columns:
|
||||||
|
merged[column] = merged[column].astype(int)
|
||||||
|
|
||||||
|
return merged.sort_values("cleavage_position").reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_ring_position_count_comparison(
|
||||||
|
comparison_table: pd.DataFrame,
|
||||||
|
ring_size: int,
|
||||||
|
design_min_atoms: int,
|
||||||
|
output_path: Path,
|
||||||
|
) -> None:
|
||||||
|
figure, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
|
||||||
|
|
||||||
|
melted = comparison_table.melt(
|
||||||
|
id_vars="cleavage_position",
|
||||||
|
value_vars=["total_fragments_all", "total_fragments_gt3"],
|
||||||
|
var_name="series",
|
||||||
|
value_name="count",
|
||||||
|
)
|
||||||
|
melted["series"] = melted["series"].map(
|
||||||
|
{
|
||||||
|
"total_fragments_all": "All splice-ready fragments",
|
||||||
|
"total_fragments_gt3": f"Fragments with >={design_min_atoms} heavy atoms",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
sns.barplot(data=melted, x="cleavage_position", y="count", hue="series", ax=axes[0])
|
||||||
|
axes[0].set_title("Fragment Counts by Position")
|
||||||
|
axes[0].set_ylabel("Fragment rows")
|
||||||
|
axes[0].set_xlabel("")
|
||||||
|
axes[0].legend(loc="upper right")
|
||||||
|
|
||||||
|
sns.barplot(data=comparison_table, x="cleavage_position", y="gt3_row_fraction", ax=axes[1], color="#f59e0b")
|
||||||
|
axes[1].set_title(f"Fraction of Position-Specific Fragments with >={design_min_atoms} Heavy Atoms")
|
||||||
|
axes[1].set_ylabel("Fraction")
|
||||||
|
axes[1].set_xlabel("Cleavage position")
|
||||||
|
axes[1].set_ylim(0, 1.0)
|
||||||
|
|
||||||
|
figure.suptitle(f"{ring_size}-Membered Ring: Position Counts Before and After Size Filtering", y=1.01)
|
||||||
|
figure.tight_layout()
|
||||||
|
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||||||
|
plt.close(figure)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_ring_position_atom_count_boxplot(
|
||||||
|
filtered_dataframe: pd.DataFrame,
|
||||||
|
ring_size: int,
|
||||||
|
design_min_atoms: int,
|
||||||
|
output_path: Path,
|
||||||
|
) -> None:
|
||||||
|
if filtered_dataframe.empty:
|
||||||
|
figure, axis = plt.subplots(figsize=(10, 4))
|
||||||
|
axis.text(0.5, 0.5, "No fragments satisfy the size filter.", ha="center", va="center")
|
||||||
|
axis.set_axis_off()
|
||||||
|
figure.suptitle(
|
||||||
|
f"{ring_size}-Membered Ring: Atom Count Distribution for Fragments with >={design_min_atoms} Heavy Atoms",
|
||||||
|
y=0.98,
|
||||||
|
)
|
||||||
|
figure.tight_layout()
|
||||||
|
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||||||
|
plt.close(figure)
|
||||||
|
return
|
||||||
|
|
||||||
|
figure, axis = plt.subplots(figsize=(14, 6))
|
||||||
|
sns.boxplot(data=filtered_dataframe, x="cleavage_position", y="fragment_atom_count", ax=axis, color="#10b981")
|
||||||
|
axis.set_title(f"{ring_size}-Membered Ring: Atom Count Distribution for Fragments with >={design_min_atoms} Heavy Atoms")
|
||||||
|
axis.set_xlabel("Cleavage position")
|
||||||
|
axis.set_ylabel("Fragment atom count")
|
||||||
|
figure.tight_layout()
|
||||||
|
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||||||
|
plt.close(figure)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_medchem_hotspot_comparison(
|
||||||
|
hotspot_table: pd.DataFrame,
|
||||||
|
ring_size: int,
|
||||||
|
design_min_atoms: int,
|
||||||
|
output_path: Path,
|
||||||
|
) -> None:
|
||||||
|
figure, axes = plt.subplots(2, 2, figsize=(14, 9), sharex=True)
|
||||||
|
positions = hotspot_table["cleavage_position"].astype(str)
|
||||||
|
|
||||||
|
sns.barplot(x=positions, y=hotspot_table["total_fragments_all"], ax=axes[0, 0], color="#2563eb")
|
||||||
|
axes[0, 0].set_title("All splice-ready fragments")
|
||||||
|
axes[0, 0].set_ylabel("Rows")
|
||||||
|
axes[0, 0].set_xlabel("")
|
||||||
|
|
||||||
|
sns.barplot(x=positions, y=hotspot_table["total_fragments_gt3"], ax=axes[0, 1], color="#10b981")
|
||||||
|
axes[0, 1].set_title(f"Fragments with >={design_min_atoms} heavy atoms")
|
||||||
|
axes[0, 1].set_ylabel("Rows")
|
||||||
|
axes[0, 1].set_xlabel("")
|
||||||
|
|
||||||
|
sns.barplot(x=positions, y=hotspot_table["unique_fragments_gt3"], ax=axes[1, 0], color="#f59e0b")
|
||||||
|
axes[1, 0].set_title(f"Unique fragments with >={design_min_atoms} heavy atoms")
|
||||||
|
axes[1, 0].set_ylabel("Unique SMILES")
|
||||||
|
axes[1, 0].set_xlabel("Cleavage position")
|
||||||
|
|
||||||
|
sns.barplot(x=positions, y=hotspot_table["normalized_shannon_entropy_gt3"], ax=axes[1, 1], color="#7c3aed")
|
||||||
|
axes[1, 1].set_title("Normalized Shannon entropy after size filtering")
|
||||||
|
axes[1, 1].set_ylabel("Entropy")
|
||||||
|
axes[1, 1].set_xlabel("Cleavage position")
|
||||||
|
axes[1, 1].set_ylim(0, 1.0)
|
||||||
|
|
||||||
|
figure.suptitle(f"{ring_size}-Membered Ring: Medicinal-Chemistry Hotspot Comparison", y=1.01)
|
||||||
|
figure.tight_layout()
|
||||||
|
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||||||
|
plt.close(figure)
|
||||||
|
|
||||||
|
|
||||||
|
def plot_ring_sensitivity(
|
||||||
|
sensitivity_table: pd.DataFrame,
|
||||||
|
ring_size: int,
|
||||||
|
design_min_atoms: int,
|
||||||
|
output_path: Path,
|
||||||
|
) -> None:
|
||||||
|
figure, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
|
||||||
|
|
||||||
|
melted = sensitivity_table.melt(
|
||||||
|
id_vars="cleavage_position",
|
||||||
|
value_vars=["cyclic_fragment_rows", "acyclic_fragment_rows"],
|
||||||
|
var_name="series",
|
||||||
|
value_name="count",
|
||||||
|
)
|
||||||
|
melted["series"] = melted["series"].map(
|
||||||
|
{
|
||||||
|
"cyclic_fragment_rows": "Ring-bearing side chains",
|
||||||
|
"acyclic_fragment_rows": "Acyclic side chains",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
sns.barplot(data=melted, x="cleavage_position", y="count", hue="series", ax=axes[0])
|
||||||
|
axes[0].set_title(f"Fragments with >={design_min_atoms} Heavy Atoms: Cyclic vs Acyclic Side Chains")
|
||||||
|
axes[0].set_ylabel("Fragment rows")
|
||||||
|
axes[0].set_xlabel("")
|
||||||
|
axes[0].legend(loc="upper right")
|
||||||
|
|
||||||
|
sns.barplot(data=sensitivity_table, x="cleavage_position", y="acyclic_row_fraction", ax=axes[1], color="#f59e0b")
|
||||||
|
axes[1].set_title("Acyclic Fraction After Size Filtering")
|
||||||
|
axes[1].set_ylabel("Acyclic fraction")
|
||||||
|
axes[1].set_xlabel("Cleavage position")
|
||||||
|
axes[1].set_ylim(0, 1.0)
|
||||||
|
|
||||||
|
figure.suptitle(f"{ring_size}-Membered Ring: Sensitivity to Ring-Bearing Side Chains", y=1.01)
|
||||||
|
figure.tight_layout()
|
||||||
|
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||||||
|
plt.close(figure)
|
||||||
|
|
||||||
|
|
||||||
|
def format_top_positions(table: pd.DataFrame, sort_column: str, limit: int = 5) -> str:
|
||||||
|
if table.empty:
|
||||||
|
return "No positions available."
|
||||||
|
subset = table.sort_values(sort_column, ascending=False).head(limit)
|
||||||
|
return subset.to_string(index=False)
|
||||||
|
|
||||||
|
|
||||||
|
def build_markdown_report(
|
||||||
|
output_dir: Path,
|
||||||
|
analysis_df: pd.DataFrame,
|
||||||
|
ring_df: pd.DataFrame,
|
||||||
|
filtered_ring_df: pd.DataFrame,
|
||||||
|
filter_candidates: pd.DataFrame,
|
||||||
|
diversity_all: pd.DataFrame,
|
||||||
|
diversity_gt3: pd.DataFrame,
|
||||||
|
position_counts: pd.DataFrame,
|
||||||
|
ring_sensitivity_table: pd.DataFrame,
|
||||||
|
hotspot_table: pd.DataFrame,
|
||||||
|
ring_size: int,
|
||||||
|
design_min_atoms: int,
|
||||||
|
) -> str:
|
||||||
|
atom_counts = analysis_df["fragment_atom_count"]
|
||||||
|
annotated_filtered = annotate_fragment_ring_counts(filtered_ring_df)
|
||||||
|
acyclic_gt3 = annotated_filtered[annotated_filtered["fragment_ring_count"] == 0].copy()
|
||||||
|
acyclic_diversity = build_position_diversity_table(acyclic_gt3) if not acyclic_gt3.empty else diversity_gt3.iloc[0:0].copy()
|
||||||
|
conservative_filter = filter_candidates.loc[
|
||||||
|
filter_candidates["drop_if_atom_count_lte"] == max(design_min_atoms - 2, 1)
|
||||||
|
].iloc[0]
|
||||||
|
design_filter = filter_candidates.loc[
|
||||||
|
filter_candidates["drop_if_atom_count_lte"] == design_min_atoms - 1
|
||||||
|
].iloc[0]
|
||||||
|
robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20]
|
||||||
|
top_filtered_counts = position_counts[
|
||||||
|
["cleavage_position", "total_fragments_gt3", "gt3_row_fraction"]
|
||||||
|
].sort_values("total_fragments_gt3", ascending=False).head(6)
|
||||||
|
|
||||||
|
lines = [
|
||||||
|
"# Fragment Library Analysis Report",
|
||||||
|
"",
|
||||||
|
"## Scope and Dataset",
|
||||||
|
"",
|
||||||
|
f"- Input fragment library: `{analysis_df['source_type'].mode().iloc[0]}` rows merged with `parent_molecules` metadata.",
|
||||||
|
f"- Current validated design library contains **{len(analysis_df):,}** splice-ready fragment rows from **{analysis_df['source_parent_ml_id'].nunique():,}** parent macrolactones.",
|
||||||
|
f"- The {ring_size}-membered subset contains **{len(ring_df):,}** fragment rows from **{ring_df['source_parent_ml_id'].nunique():,}** parent molecules.",
|
||||||
|
f"- This dataset is narrower than the earlier broad workflow summary because it only keeps **splice-ready, single-anchor fragments** from the validated library. It should not be compared to prior total-fragment counts as if they were the same denominator.",
|
||||||
|
"",
|
||||||
|
"## Figure 1. Global Atom-Count Distribution",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
f"- Heavy-atom count is extremely right-skewed: p05={atom_counts.quantile(0.05):.1f}, p25={atom_counts.quantile(0.25):.1f}, median={atom_counts.quantile(0.50):.1f}, p75={atom_counts.quantile(0.75):.1f}, p95={atom_counts.quantile(0.95):.1f}.",
|
||||||
|
f"- A conservative cleanup filter of `<= {design_min_atoms - 2}` heavy atoms removes **{int(conservative_filter.removed_rows):,}** rows ({conservative_filter.removed_row_fraction:.1%}) but only **{int(conservative_filter.removed_unique_fragments):,}** unique fragment SMILES ({conservative_filter.removed_unique_fraction:.1%}).",
|
||||||
|
f"- A design-oriented filter aligned with your previous analysis, `<= {design_min_atoms - 1}` heavy atoms, removes **{int(design_filter.removed_rows):,}** rows ({design_filter.removed_row_fraction:.1%}) and **{int(design_filter.removed_unique_fragments):,}** unique fragment SMILES ({design_filter.removed_unique_fraction:.1%}).",
|
||||||
|
f"- Interpretation: `<= {design_min_atoms - 2}` is the safer default for library cleanup, while `> {design_min_atoms - 1}` is useful for positional diversity analysis because it suppresses one- and two-atom noise.",
|
||||||
|
"",
|
||||||
|
"## Figure 2. Ring-Specific Counts Before and After Size Filtering",
|
||||||
|
"",
|
||||||
|
f"",
|
||||||
|
"",
|
||||||
|
f"- This figure compares all splice-ready fragments with the design-oriented subset that keeps fragments with **>= {design_min_atoms} heavy atoms**.",
|
||||||
|
f"- After size filtering, the most populated {ring_size}-membered positions are:",
|
||||||
|
"",
|
||||||
|
"```text",
|
||||||
|
top_filtered_counts.to_string(index=False),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"## Figure 3. Atom-Count Spread for Design-Relevant Fragments",
|
||||||
|
"",
|
||||||
|
f"",
|
||||||
|
"",
|
||||||
|
f"- This boxplot only includes fragments with **>= {design_min_atoms} heavy atoms**.",
|
||||||
|
f"- Positions 13, 3, 4, 11 and 6 carry the broadest large-fragment size envelopes; position 15 remains common but its size spread is narrower, indicating repeated reuse of a small set of acyl-like substituents.",
|
||||||
|
"",
|
||||||
|
"## Figure 4. Position-Wise Diversity Metrics After Filtering",
|
||||||
|
"",
|
||||||
|
f"",
|
||||||
|
"",
|
||||||
|
"- Diversity is evaluated with three complementary metrics:",
|
||||||
|
" 1. `unique_fragments`: raw chemotype count.",
|
||||||
|
" 2. `normalized_shannon_entropy`: how evenly those chemotypes are distributed.",
|
||||||
|
" 3. `mean_pairwise_tanimoto_distance`: structural spread in Morgan fingerprint space.",
|
||||||
|
"",
|
||||||
|
"Top robust positions after removing <=3-heavy-atom fragments (`total_fragments >= 20`):",
|
||||||
|
"",
|
||||||
|
"```text",
|
||||||
|
format_top_positions(
|
||||||
|
robust_gt3[
|
||||||
|
[
|
||||||
|
"cleavage_position",
|
||||||
|
"total_fragments",
|
||||||
|
"unique_fragments",
|
||||||
|
"normalized_shannon_entropy",
|
||||||
|
"mean_pairwise_tanimoto_distance",
|
||||||
|
"mean_atom_count",
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"normalized_shannon_entropy",
|
||||||
|
limit=8,
|
||||||
|
),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
f"- Within the filtered {ring_size}-membered set, **position 6** is the strongest support for your medicinal-chemistry hypothesis: it has moderate abundance (**{int(hotspot_table.loc[hotspot_table['cleavage_position'] == 6, 'total_fragments_gt3'].iloc[0])}** rows) but high chemotype diversity and near-maximal entropy.",
|
||||||
|
f"- **Position 15** is also clearly relevant because it retains **{int(hotspot_table.loc[hotspot_table['cleavage_position'] == 15, 'total_fragments_gt3'].iloc[0])}** filtered rows, but its diversity is narrow; the site is frequent, yet dominated by a few acyl motifs.",
|
||||||
|
"",
|
||||||
|
"## Figure 5. Focus on Medicinal-Chemistry Hotspots",
|
||||||
|
"",
|
||||||
|
f"",
|
||||||
|
"",
|
||||||
|
"This panel focuses on positions 6, 7, 15 and 16 because these are the literature-guided derivatization positions from tylosin / tilmicosin / tildipirosin / tulathromycin-like scaffold analysis.",
|
||||||
|
"",
|
||||||
|
"```text",
|
||||||
|
hotspot_table[
|
||||||
|
[
|
||||||
|
"cleavage_position",
|
||||||
|
"total_fragments_all",
|
||||||
|
"total_fragments_gt3",
|
||||||
|
"unique_fragments_gt3",
|
||||||
|
"normalized_shannon_entropy_gt3",
|
||||||
|
"mean_pairwise_tanimoto_distance_gt3",
|
||||||
|
]
|
||||||
|
].to_string(index=False),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"- Position 6: supported by the current database as a **design-relevant and structurally diverse** site.",
|
||||||
|
"- Position 7: not supported as a prevalent natural hotspot in the current database; it appears only a few times after filtering, so it should be described as a **literature- or scaffold-guided modification site**, not a database-enriched site.",
|
||||||
|
"- Position 15: supported as a **frequent modification site**, but the retained chemotypes are concentrated into a small number of acyl substituents.",
|
||||||
|
"- Position 16: not prevalent in the current database, but the few retained fragments are structurally distinct singletons; this makes it a **low-evidence exploratory site**, not a high-confidence natural hotspot.",
|
||||||
|
"",
|
||||||
|
"## Figure 6. Are the Top Positions Driven by Ring-Bearing Side Chains?",
|
||||||
|
"",
|
||||||
|
f"",
|
||||||
|
"",
|
||||||
|
f"- Multi-anchor bridge or fused components do **not** enter the fragment library because the collector only keeps side-chain components with exactly one ring connection; see [src/macro_lactone_toolkit/_core.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/_core.py#L293) and [src/macro_lactone_toolkit/validation/validator.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/validation/validator.py#L206).",
|
||||||
|
"- What can still happen is that a retained fragment is a **single-anchor but ring-bearing side chain** such as a sugar, heterocycle or other cyclic appendage.",
|
||||||
|
"",
|
||||||
|
"```text",
|
||||||
|
ring_sensitivity_table[
|
||||||
|
[
|
||||||
|
"cleavage_position",
|
||||||
|
"total_fragments_gt3",
|
||||||
|
"cyclic_fragment_rows",
|
||||||
|
"acyclic_fragment_rows",
|
||||||
|
"acyclic_row_fraction",
|
||||||
|
"cyclic_unique_fragments",
|
||||||
|
"acyclic_unique_fragments",
|
||||||
|
]
|
||||||
|
].to_string(index=False),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"- This shows that positions 13, 4, 3 and 6 are indeed dominated by **ring-bearing single-anchor side chains**, not by leaked bridge fragments.",
|
||||||
|
"- Position 15 behaves differently: almost all retained `>3` fragments are **acyclic** acyl-like substituents.",
|
||||||
|
"- Therefore, if your scientific question is specifically about non-cyclic side-chain diversification, the ranking should be recalculated on an acyclic-only subset.",
|
||||||
|
"",
|
||||||
|
"Acyclic-only ranking after removing <=3-heavy-atom fragments:",
|
||||||
|
"",
|
||||||
|
"```text",
|
||||||
|
format_top_positions(
|
||||||
|
acyclic_diversity[
|
||||||
|
[
|
||||||
|
"cleavage_position",
|
||||||
|
"total_fragments",
|
||||||
|
"unique_fragments",
|
||||||
|
"normalized_shannon_entropy",
|
||||||
|
"mean_pairwise_tanimoto_distance",
|
||||||
|
"mean_atom_count",
|
||||||
|
]
|
||||||
|
].sort_values("total_fragments", ascending=False),
|
||||||
|
"total_fragments",
|
||||||
|
limit=8,
|
||||||
|
),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"- Under this stricter acyclic-only view, position 15 becomes the dominant site, followed by 9, 12 and 3. Positions 13 and 4 no longer dominate, confirming that their earlier prominence is mainly driven by cyclic side chains.",
|
||||||
|
"",
|
||||||
|
"## Reconciliation With the Previous Conclusion",
|
||||||
|
"",
|
||||||
|
"The earlier statement that `6,7,15,16` are important 16-membered macrolide modification positions is **not invalidated** by the current analysis, but it must be phrased carefully because it answers a different question.",
|
||||||
|
"",
|
||||||
|
"- The **previous conclusion** came from scaffold-centric medicinal chemistry on known 16-membered macrolides and identifies **synthetically exploited modification positions**.",
|
||||||
|
"- The **current MacrolactoneDB analysis** measures **natural side-chain occurrence and diversity** in a validated fragment library.",
|
||||||
|
"- These are related but not identical concepts. A site can be medicinally attractive even if it is rare in natural products, and a site can be naturally diverse without being the preferred position for semisynthetic optimization.",
|
||||||
|
"",
|
||||||
|
"### Recommended paper-safe wording",
|
||||||
|
"",
|
||||||
|
f"> In the validated MacrolactoneDB fragment library, natural side-chain diversity of {ring_size}-membered macrolactones is concentrated primarily at positions 13, 3/4 and 12. After excluding fragments with <=3 heavy atoms to focus on design-relevant substituents, position 6 remains strongly diversity-enriched and position 15 remains frequency-enriched, whereas positions 7 and 16 are sparse and should be interpreted as literature-guided derivatization sites rather than statistically dominant natural hotspots.",
|
||||||
|
"",
|
||||||
|
"### Practical interpretation for fragment-library design",
|
||||||
|
"",
|
||||||
|
f"- Use `<= {design_min_atoms - 2}` heavy atoms as the **default cleanup filter** for the production fragment library.",
|
||||||
|
f"- Use the stricter `> {design_min_atoms - 1}` heavy-atom subset when discussing **position-wise diversity** or aligning with the previous exploratory analysis.",
|
||||||
|
f"- For 16-membered macrolide design, prioritize positions **13, 3, 4, 12 and 6** for natural-diversity-driven fragment mining.",
|
||||||
|
"- Keep positions **15** as a targeted acyl-modification site even though its chemotype diversity is narrower.",
|
||||||
|
"- Treat positions **7 and 16** as hypothesis-driven medicinal chemistry positions that need literature or synthesis justification beyond database prevalence.",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def build_markdown_report_zh(
|
||||||
|
analysis_df: pd.DataFrame,
|
||||||
|
ring_df: pd.DataFrame,
|
||||||
|
filtered_ring_df: pd.DataFrame,
|
||||||
|
filter_candidates: pd.DataFrame,
|
||||||
|
diversity_gt3: pd.DataFrame,
|
||||||
|
position_counts: pd.DataFrame,
|
||||||
|
ring_sensitivity_table: pd.DataFrame,
|
||||||
|
hotspot_table: pd.DataFrame,
|
||||||
|
ring_size: int,
|
||||||
|
design_min_atoms: int,
|
||||||
|
) -> str:
|
||||||
|
conservative_filter = filter_candidates.loc[
|
||||||
|
filter_candidates["drop_if_atom_count_lte"] == max(design_min_atoms - 2, 1)
|
||||||
|
].iloc[0]
|
||||||
|
design_filter = filter_candidates.loc[
|
||||||
|
filter_candidates["drop_if_atom_count_lte"] == design_min_atoms - 1
|
||||||
|
].iloc[0]
|
||||||
|
robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20]
|
||||||
|
annotated_filtered = annotate_fragment_ring_counts(filtered_ring_df)
|
||||||
|
acyclic_gt3 = annotated_filtered[annotated_filtered["fragment_ring_count"] == 0].copy()
|
||||||
|
acyclic_diversity = build_position_diversity_table(acyclic_gt3) if not acyclic_gt3.empty else diversity_gt3.iloc[0:0].copy()
|
||||||
|
|
||||||
|
paper_ready_paragraph = (
|
||||||
|
f"在当前验证后的 MacrolactoneDB 片段库中,桥连或双锚点侧链不会进入当前片段库,因此 16 元大环内酯位点排序的变化并不是由桥环片段误纳入所致。"
|
||||||
|
f"本研究的断裂规则仅保留与主环具有单一连接点的可拼接侧链;然而,许多被保留的大侧链本身仍可包含糖环、杂环或稠合环等 cyclic single-anchor side chains(带环单锚点侧链),"
|
||||||
|
f"这会显著抬高 13、3、4、12 以及 6 位的天然侧链多样性统计。相反,当仅保留 >3 个重原子且进一步限定为非环状侧链后,位点排序转而更偏向 15、9、12 和 3 位,"
|
||||||
|
f"说明 13、3、4、12 的优势主要反映了天然带环侧链的富集,而 15 位则更接近药物化学中常见的非环状酰基修饰位点。"
|
||||||
|
f"因此,在论文讨论中应明确区分“天然带环侧链多样性热点”和“非环状半合成修饰热点”这两类位点概念。"
|
||||||
|
)
|
||||||
|
|
||||||
|
lines = [
|
||||||
|
"# 大环内酯碎片库分析报告(中文)",
|
||||||
|
"",
|
||||||
|
"## 数据范围",
|
||||||
|
"",
|
||||||
|
f"- 当前验证后的可拼接碎片库包含 **{len(analysis_df):,}** 条片段记录,来源于 **{analysis_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
|
||||||
|
f"- 其中 {ring_size} 元环子集包含 **{len(ring_df):,}** 条片段记录,来源于 **{ring_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
|
||||||
|
f"- 用于设计相关位点分析的严格子集定义为:片段重原子数 **>= {design_min_atoms}**。",
|
||||||
|
"",
|
||||||
|
"## 全库碎片大小结论",
|
||||||
|
"",
|
||||||
|
f"- 默认清洗阈值建议使用 `<= {design_min_atoms - 2}` 重原子删除。该阈值会删除 **{int(conservative_filter.removed_rows):,}** 条记录({conservative_filter.removed_row_fraction:.1%}),但仅删除 **{int(conservative_filter.removed_unique_fragments):,}** 个唯一片段({conservative_filter.removed_unique_fraction:.1%})。",
|
||||||
|
f"- 若用于和你之前的分析口径对齐,则建议采用 `> {design_min_atoms - 1}` 重原子作为设计相关片段集合。此时会删除 **{int(design_filter.removed_rows):,}** 条记录({design_filter.removed_row_fraction:.1%})。",
|
||||||
|
"",
|
||||||
|
"## 16 元环位点结论",
|
||||||
|
"",
|
||||||
|
f"- 在 {ring_size} 元环中,保留所有可拼接片段时,天然侧链多样性较高的位置主要集中在 `13、3/4、12`,并且 6 位也显示出较强的设计相关多样性。",
|
||||||
|
f"- 在 `> {design_min_atoms - 1}` 重原子子集中,样本量足够且多样性较高的位点为:",
|
||||||
|
"",
|
||||||
|
"```text",
|
||||||
|
robust_gt3[
|
||||||
|
[
|
||||||
|
"cleavage_position",
|
||||||
|
"total_fragments",
|
||||||
|
"unique_fragments",
|
||||||
|
"normalized_shannon_entropy",
|
||||||
|
"mean_pairwise_tanimoto_distance",
|
||||||
|
"mean_atom_count",
|
||||||
|
]
|
||||||
|
]
|
||||||
|
.sort_values("normalized_shannon_entropy", ascending=False)
|
||||||
|
.head(8)
|
||||||
|
.to_string(index=False),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"- 其中 6 位最能支持你的药化判断:它不是最高频位点,但在设计相关大侧链中显示出很高的结构多样性。",
|
||||||
|
"- 15 位则更偏向高频但低多样性的酰基修饰位点。",
|
||||||
|
"",
|
||||||
|
"## 桥环 / 稠环干扰的敏感性分析",
|
||||||
|
"",
|
||||||
|
"桥连或双锚点侧链不会进入当前片段库,因为断裂逻辑只保留与主环存在 **1 个连接点** 的侧链组件。也就是说,真正的 bridge / fused multi-anchor components 已被代码层面排除。",
|
||||||
|
"",
|
||||||
|
"但是,需要额外区分另一类情况:**cyclic single-anchor side chains**。这类片段虽然只在一个位置连到主环,因此会被保留下来,但片段自身可能包含糖环、杂环或其他环状骨架,仍然会显著影响位点多样性排名。",
|
||||||
|
"",
|
||||||
|
"敏感性分析结果如下:",
|
||||||
|
"",
|
||||||
|
"```text",
|
||||||
|
ring_sensitivity_table[
|
||||||
|
[
|
||||||
|
"cleavage_position",
|
||||||
|
"total_fragments_gt3",
|
||||||
|
"cyclic_fragment_rows",
|
||||||
|
"acyclic_fragment_rows",
|
||||||
|
"acyclic_row_fraction",
|
||||||
|
"cyclic_unique_fragments",
|
||||||
|
"acyclic_unique_fragments",
|
||||||
|
]
|
||||||
|
].to_string(index=False),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"- `13` 位:`689/709` 条大侧链片段本身带环,说明该位点的高多样性主要由天然糖基/环状侧链驱动。",
|
||||||
|
"- `4` 位:`263/269` 条大侧链片段带环,几乎同样完全由环状侧链主导。",
|
||||||
|
"- `3` 位:`231/269` 条大侧链片段带环,带环片段占主导。",
|
||||||
|
"- `12` 位:环状与非环状侧链并存,但环状侧链仍占多数。",
|
||||||
|
"- `6` 位:多数保留的大侧链也带环,但仍表现出较强的多样性。",
|
||||||
|
"- `15` 位:几乎全部是非环状侧链,因此更接近半合成药化修饰位点的特征。",
|
||||||
|
"",
|
||||||
|
"## 仅看非环状侧链时的位点排序",
|
||||||
|
"",
|
||||||
|
f"- 当只保留 `> {design_min_atoms - 1}` 重原子且 **不含环** 的侧链时,位点排序明显变化:",
|
||||||
|
"",
|
||||||
|
"```text",
|
||||||
|
acyclic_diversity[
|
||||||
|
[
|
||||||
|
"cleavage_position",
|
||||||
|
"total_fragments",
|
||||||
|
"unique_fragments",
|
||||||
|
"normalized_shannon_entropy",
|
||||||
|
"mean_pairwise_tanimoto_distance",
|
||||||
|
"mean_atom_count",
|
||||||
|
]
|
||||||
|
]
|
||||||
|
.sort_values("total_fragments", ascending=False)
|
||||||
|
.head(8)
|
||||||
|
.to_string(index=False),
|
||||||
|
"```",
|
||||||
|
"",
|
||||||
|
"- 在这个更严格的口径下,`15` 位成为最主要的非环状侧链位点,随后是 `9、12、3` 位。",
|
||||||
|
"- 因此,`13、3、4、12` 的高天然多样性是真实存在的,但它主要表征的是**带环单锚点天然侧链**的富集,而不是桥连片段泄漏。",
|
||||||
|
"",
|
||||||
|
"## 论文可直接引用的中文讨论段落",
|
||||||
|
"",
|
||||||
|
paper_ready_paragraph,
|
||||||
|
"",
|
||||||
|
"## 建议的论文表述方式",
|
||||||
|
"",
|
||||||
|
"- 若讨论天然产物中的侧链多样性,可写为:`16 元大环内酯的天然侧链多样性主要集中在 13、3/4 和 12 位,并在 6 位保留较强的设计相关多样性。`",
|
||||||
|
"- 若讨论药化半合成改造热点,可写为:`6、7、15、16 位代表文献和先导化合物研究中优先使用的衍生化位点,其中 6 和 15 位在数据库统计中分别对应高多样性和高频率信号,而 7 和 16 位更多体现为文献指导的探索性位点。`",
|
||||||
|
"- 若专门讨论非环状侧链设计,则应强调:`在排除 <=3 重原子小片段并进一步排除带环侧链后,15 位是最主要的非环状侧链修饰位点。`",
|
||||||
|
"",
|
||||||
|
"## 相关图表",
|
||||||
|
"",
|
||||||
|
"- 全库原子数分布:`fragment_atom_count_distribution.png`",
|
||||||
|
f"- {ring_size} 元环过滤前后位点数量对比:`ring{ring_size}_position_count_comparison.png`",
|
||||||
|
f"- {ring_size} 元环大侧链 boxplot:`ring{ring_size}_position_atom_count_boxplot_gt3.png`",
|
||||||
|
f"- {ring_size} 元环位点多样性图:`ring{ring_size}_position_diversity_gt3.png`",
|
||||||
|
f"- {ring_size} 元环桥环/带环侧链敏感性图:`ring{ring_size}_position_ring_sensitivity.png`",
|
||||||
|
f"- {ring_size} 元环药化热点对比图:`ring{ring_size}_medchem_hotspot_comparison.png`",
|
||||||
|
"",
|
||||||
|
]
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def build_summary_lines(
|
||||||
|
dataframe: pd.DataFrame,
|
||||||
|
ring_dataframe: pd.DataFrame,
|
||||||
|
filtered_ring_dataframe: pd.DataFrame,
|
||||||
|
filter_candidates: pd.DataFrame,
|
||||||
|
diversity_table: pd.DataFrame,
|
||||||
|
diversity_gt3: pd.DataFrame,
|
||||||
|
hotspot_table: pd.DataFrame,
|
||||||
|
ring_size: int,
|
||||||
|
design_min_atoms: int,
|
||||||
|
) -> list[str]:
|
||||||
|
atom_counts = dataframe["fragment_atom_count"]
|
||||||
|
robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20] if not diversity_gt3.empty else diversity_gt3
|
||||||
|
lines = [
|
||||||
|
f"Analyzed rows: {len(dataframe)}",
|
||||||
|
f"Unique parent molecules: {dataframe['source_parent_ml_id'].nunique()}",
|
||||||
|
f"Unique fragment smiles: {dataframe['fragment_smiles_plain'].nunique()}",
|
||||||
|
f"Fragment atom count percentiles: p05={atom_counts.quantile(0.05):.1f}, p25={atom_counts.quantile(0.25):.1f}, p50={atom_counts.quantile(0.5):.1f}, p75={atom_counts.quantile(0.75):.1f}, p95={atom_counts.quantile(0.95):.1f}",
|
||||||
|
"",
|
||||||
|
"Filter candidates (drop fragments with atom_count <= threshold):",
|
||||||
|
]
|
||||||
|
|
||||||
|
for candidate in filter_candidates.head(5).itertuples(index=False):
|
||||||
|
lines.append(
|
||||||
|
" "
|
||||||
|
f"<= {candidate.drop_if_atom_count_lte}: "
|
||||||
|
f"remove {candidate.removed_rows} rows ({candidate.removed_row_fraction:.1%}), "
|
||||||
|
f"remove {candidate.removed_unique_fragments} unique fragments ({candidate.removed_unique_fraction:.1%})"
|
||||||
|
)
|
||||||
|
|
||||||
|
lines.extend(
|
||||||
|
[
|
||||||
|
"",
|
||||||
|
f"Ring {ring_size} rows: {len(ring_dataframe)}",
|
||||||
|
f"Ring {ring_size} unique fragment smiles: {ring_dataframe['fragment_smiles_plain'].nunique()}",
|
||||||
|
f"Ring {ring_size} rows with >= {design_min_atoms} heavy atoms: {len(filtered_ring_dataframe)}",
|
||||||
|
f"Ring {ring_size} unique fragment smiles with >= {design_min_atoms} heavy atoms: {filtered_ring_dataframe['fragment_smiles_plain'].nunique()}",
|
||||||
|
"",
|
||||||
|
f"Ring {ring_size} top positions by normalized Shannon entropy:",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
for row in diversity_table.sort_values("normalized_shannon_entropy", ascending=False).head(5).itertuples(index=False):
|
||||||
|
lines.append(
|
||||||
|
" "
|
||||||
|
f"Position {row.cleavage_position}: entropy={row.normalized_shannon_entropy:.3f}, "
|
||||||
|
f"unique={row.unique_fragments}, mean_atom_count={row.mean_atom_count:.2f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
lines.extend(["", f"Ring {ring_size} top positions by mean pairwise Tanimoto distance:"])
|
||||||
|
for row in diversity_table.sort_values("mean_pairwise_tanimoto_distance", ascending=False).head(5).itertuples(index=False):
|
||||||
|
lines.append(
|
||||||
|
" "
|
||||||
|
f"Position {row.cleavage_position}: distance={row.mean_pairwise_tanimoto_distance:.3f}, "
|
||||||
|
f"entropy={row.normalized_shannon_entropy:.3f}, atom_count_range={row.atom_count_range}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if not robust_gt3.empty:
|
||||||
|
lines.extend(["", f"Ring {ring_size} top filtered positions by normalized Shannon entropy:"])
|
||||||
|
for row in robust_gt3.sort_values("normalized_shannon_entropy", ascending=False).head(5).itertuples(index=False):
|
||||||
|
lines.append(
|
||||||
|
" "
|
||||||
|
f"Position {row.cleavage_position}: entropy={row.normalized_shannon_entropy:.3f}, "
|
||||||
|
f"unique={row.unique_fragments}, total={row.total_fragments}, mean_atom_count={row.mean_atom_count:.2f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
lines.extend(["", "Medicinal-chemistry hotspot comparison:"])
|
||||||
|
for row in hotspot_table.itertuples(index=False):
|
||||||
|
lines.append(
|
||||||
|
" "
|
||||||
|
f"Position {row.cleavage_position}: all={row.total_fragments_all}, "
|
||||||
|
f">={design_min_atoms} atoms={row.total_fragments_gt3}, "
|
||||||
|
f"unique_filtered={row.unique_fragments_gt3}, "
|
||||||
|
f"entropy_filtered={row.normalized_shannon_entropy_gt3:.3f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
lines.extend(
|
||||||
|
[
|
||||||
|
"",
|
||||||
|
"Interpretation note: atom-count spread is only a coarse proxy for diversity.",
|
||||||
|
"Use entropy and fingerprint distance as primary diversity evidence; use atom-count spread as supporting context.",
|
||||||
|
"For cyclic-side-chain sensitivity, see ring_sensitivity output and the markdown report.",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
return lines
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: list[str] | None = None) -> None:
|
||||||
|
args = build_parser().parse_args(argv)
|
||||||
|
output_dir = Path(args.output_dir)
|
||||||
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
medchem_positions = [int(part.strip()) for part in args.medchem_positions.split(",") if part.strip()]
|
||||||
|
|
||||||
|
sns.set_theme(style="whitegrid")
|
||||||
|
|
||||||
|
dataframe = load_fragment_library_dataset(args.input, args.db)
|
||||||
|
analysis_df = dataframe[
|
||||||
|
dataframe["splice_ready"].fillna(False) & (dataframe["classification"] == "standard_macrolactone")
|
||||||
|
].copy()
|
||||||
|
ring_df = analysis_df[analysis_df["ring_size"] == args.ring_size].copy()
|
||||||
|
filtered_ring_df = ring_df[ring_df["fragment_atom_count"] >= args.design_min_atoms].copy()
|
||||||
|
|
||||||
|
if analysis_df.empty:
|
||||||
|
raise ValueError("No splice-ready standard macrolactone fragments available for analysis.")
|
||||||
|
if ring_df.empty:
|
||||||
|
raise ValueError(f"No splice-ready fragments found for ring size {args.ring_size}.")
|
||||||
|
|
||||||
|
atom_count_summary = build_fragment_atom_count_summary(analysis_df)
|
||||||
|
atom_count_frequency = build_fragment_atom_count_frequency_table(analysis_df)
|
||||||
|
filter_candidates = build_filter_candidate_table(analysis_df)
|
||||||
|
diversity_table = build_position_diversity_table(ring_df)
|
||||||
|
diversity_gt3 = (
|
||||||
|
build_position_diversity_table(filtered_ring_df) if not filtered_ring_df.empty else diversity_table.iloc[0:0].copy()
|
||||||
|
)
|
||||||
|
position_counts = build_position_count_comparison_table(ring_df, filtered_ring_df)
|
||||||
|
ring_sensitivity = build_ring_sensitivity_table(filtered_ring_df, position_counts)
|
||||||
|
hotspot_table = build_medchem_hotspot_table(medchem_positions, position_counts, diversity_table, diversity_gt3)
|
||||||
|
|
||||||
|
atom_count_summary.to_csv(output_dir / "fragment_atom_count_summary.csv", index=False)
|
||||||
|
atom_count_frequency.to_csv(output_dir / "fragment_atom_count_frequency.csv", index=False)
|
||||||
|
filter_candidates.to_csv(output_dir / "fragment_atom_count_filter_candidates.csv", index=False)
|
||||||
|
diversity_table.to_csv(output_dir / f"ring{args.ring_size}_position_diversity.csv", index=False)
|
||||||
|
diversity_gt3.to_csv(output_dir / f"ring{args.ring_size}_position_diversity_gt3.csv", index=False)
|
||||||
|
position_counts.to_csv(output_dir / f"ring{args.ring_size}_position_count_comparison.csv", index=False)
|
||||||
|
ring_sensitivity.to_csv(output_dir / f"ring{args.ring_size}_position_ring_sensitivity.csv", index=False)
|
||||||
|
hotspot_table.to_csv(output_dir / f"ring{args.ring_size}_medchem_hotspot_comparison.csv", index=False)
|
||||||
|
|
||||||
|
plot_fragment_atom_count_distribution(analysis_df, output_dir / "fragment_atom_count_distribution.png")
|
||||||
|
plot_ring_position_count_comparison(
|
||||||
|
position_counts,
|
||||||
|
args.ring_size,
|
||||||
|
args.design_min_atoms,
|
||||||
|
output_dir / f"ring{args.ring_size}_position_count_comparison.png",
|
||||||
|
)
|
||||||
|
plot_ring_position_atom_count_distribution(
|
||||||
|
ring_df, args.ring_size, output_dir / f"ring{args.ring_size}_position_atom_count_distribution.png"
|
||||||
|
)
|
||||||
|
plot_ring_position_atom_count_boxplot(
|
||||||
|
filtered_ring_df,
|
||||||
|
args.ring_size,
|
||||||
|
args.design_min_atoms,
|
||||||
|
output_dir / f"ring{args.ring_size}_position_atom_count_boxplot_gt3.png",
|
||||||
|
)
|
||||||
|
plot_ring_position_diversity(
|
||||||
|
diversity_table,
|
||||||
|
args.ring_size,
|
||||||
|
output_dir / f"ring{args.ring_size}_position_diversity.png",
|
||||||
|
)
|
||||||
|
plot_ring_position_diversity(
|
||||||
|
diversity_gt3,
|
||||||
|
args.ring_size,
|
||||||
|
output_dir / f"ring{args.ring_size}_position_diversity_gt3.png",
|
||||||
|
title_suffix=f"Side-Chain Diversity by Position (>= {args.design_min_atoms} Heavy Atoms)",
|
||||||
|
)
|
||||||
|
plot_ring_sensitivity(
|
||||||
|
ring_sensitivity,
|
||||||
|
args.ring_size,
|
||||||
|
args.design_min_atoms,
|
||||||
|
output_dir / f"ring{args.ring_size}_position_ring_sensitivity.png",
|
||||||
|
)
|
||||||
|
plot_medchem_hotspot_comparison(
|
||||||
|
hotspot_table,
|
||||||
|
args.ring_size,
|
||||||
|
args.design_min_atoms,
|
||||||
|
output_dir / f"ring{args.ring_size}_medchem_hotspot_comparison.png",
|
||||||
|
)
|
||||||
|
|
||||||
|
summary_lines = build_summary_lines(
|
||||||
|
analysis_df,
|
||||||
|
ring_df,
|
||||||
|
filtered_ring_df,
|
||||||
|
filter_candidates,
|
||||||
|
diversity_table,
|
||||||
|
diversity_gt3,
|
||||||
|
hotspot_table,
|
||||||
|
args.ring_size,
|
||||||
|
args.design_min_atoms,
|
||||||
|
)
|
||||||
|
(output_dir / "analysis_summary.txt").write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
|
||||||
|
report = build_markdown_report(
|
||||||
|
output_dir,
|
||||||
|
analysis_df,
|
||||||
|
ring_df,
|
||||||
|
filtered_ring_df,
|
||||||
|
filter_candidates,
|
||||||
|
diversity_table,
|
||||||
|
diversity_gt3,
|
||||||
|
position_counts,
|
||||||
|
ring_sensitivity,
|
||||||
|
hotspot_table,
|
||||||
|
args.ring_size,
|
||||||
|
args.design_min_atoms,
|
||||||
|
)
|
||||||
|
(output_dir / "fragment_library_analysis_report.md").write_text(report + "\n", encoding="utf-8")
|
||||||
|
report_zh = build_markdown_report_zh(
|
||||||
|
analysis_df,
|
||||||
|
ring_df,
|
||||||
|
filtered_ring_df,
|
||||||
|
filter_candidates,
|
||||||
|
diversity_gt3,
|
||||||
|
position_counts,
|
||||||
|
ring_sensitivity,
|
||||||
|
hotspot_table,
|
||||||
|
args.ring_size,
|
||||||
|
args.design_min_atoms,
|
||||||
|
)
|
||||||
|
(output_dir / "fragment_library_analysis_report_zh.md").write_text(report_zh + "\n", encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -50,8 +50,14 @@ def main():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--id-col",
|
"--id-col",
|
||||||
type=str,
|
type=str,
|
||||||
|
default="ml_id",
|
||||||
|
help="ID column name (default: ml_id)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--chembl-id-col",
|
||||||
|
type=str,
|
||||||
default="IDs",
|
default="IDs",
|
||||||
help="ID column name",
|
help="CHEMBL ID column name (default: IDs)",
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@@ -69,6 +75,7 @@ def main():
|
|||||||
sample_ratio=args.sample_ratio,
|
sample_ratio=args.sample_ratio,
|
||||||
smiles_col=args.smiles_col,
|
smiles_col=args.smiles_col,
|
||||||
id_col=args.id_col,
|
id_col=args.id_col,
|
||||||
|
chembl_id_col=args.chembl_id_col,
|
||||||
)
|
)
|
||||||
|
|
||||||
results = validator.run(args.input)
|
results = validator.run(args.input)
|
||||||
|
|||||||
@@ -272,6 +272,44 @@ def collect_side_chain_atoms(
|
|||||||
return side_chain_atoms
|
return side_chain_atoms
|
||||||
|
|
||||||
|
|
||||||
|
def find_side_chain_ring_connections(
|
||||||
|
mol: Chem.Mol,
|
||||||
|
side_chain_atoms: Iterable[int],
|
||||||
|
ring_atom_indices: Iterable[int],
|
||||||
|
) -> list[tuple[int, int]]:
|
||||||
|
ring_atom_set = set(ring_atom_indices)
|
||||||
|
connections: set[tuple[int, int]] = set()
|
||||||
|
|
||||||
|
for atom_idx in side_chain_atoms:
|
||||||
|
atom = mol.GetAtomWithIdx(atom_idx)
|
||||||
|
for neighbor in atom.GetNeighbors():
|
||||||
|
neighbor_idx = neighbor.GetIdx()
|
||||||
|
if neighbor_idx in ring_atom_set:
|
||||||
|
connections.add((atom_idx, neighbor_idx))
|
||||||
|
|
||||||
|
return sorted(connections, key=lambda connection: (connection[1], connection[0]))
|
||||||
|
|
||||||
|
|
||||||
|
def collect_fragmentable_side_chain_atoms(
|
||||||
|
mol: Chem.Mol,
|
||||||
|
start_atom_idx: int,
|
||||||
|
ring_atom_indices: Iterable[int],
|
||||||
|
ring_atom_idx: int | None = None,
|
||||||
|
) -> list[int] | None:
|
||||||
|
side_chain_atoms = collect_side_chain_atoms(mol, start_atom_idx, ring_atom_indices)
|
||||||
|
if not side_chain_atoms:
|
||||||
|
return None
|
||||||
|
|
||||||
|
ring_connections = find_side_chain_ring_connections(mol, side_chain_atoms, ring_atom_indices)
|
||||||
|
if len(ring_connections) != 1:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if ring_atom_idx is not None and ring_connections[0][1] != ring_atom_idx:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return side_chain_atoms
|
||||||
|
|
||||||
|
|
||||||
def is_intrinsic_lactone_neighbor(
|
def is_intrinsic_lactone_neighbor(
|
||||||
mol: Chem.Mol,
|
mol: Chem.Mol,
|
||||||
candidate: DetectedMacrolactone,
|
candidate: DetectedMacrolactone,
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ from rdkit.Chem import Descriptors
|
|||||||
from ._core import (
|
from ._core import (
|
||||||
build_fragment_smiles,
|
build_fragment_smiles,
|
||||||
build_numbering_result,
|
build_numbering_result,
|
||||||
collect_side_chain_atoms,
|
collect_fragmentable_side_chain_atoms,
|
||||||
ensure_mol,
|
ensure_mol,
|
||||||
find_macrolactone_candidates,
|
find_macrolactone_candidates,
|
||||||
is_intrinsic_lactone_neighbor,
|
is_intrinsic_lactone_neighbor,
|
||||||
@@ -44,6 +44,8 @@ class MacrolactoneFragmenter:
|
|||||||
fragments: list[SideChainFragment] = []
|
fragments: list[SideChainFragment] = []
|
||||||
|
|
||||||
for position, ring_atom_idx in numbering.position_to_atom.items():
|
for position, ring_atom_idx in numbering.position_to_atom.items():
|
||||||
|
if int(position) <= 2:
|
||||||
|
continue
|
||||||
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
||||||
for neighbor in ring_atom.GetNeighbors():
|
for neighbor in ring_atom.GetNeighbors():
|
||||||
neighbor_idx = neighbor.GetIdx()
|
neighbor_idx = neighbor.GetIdx()
|
||||||
@@ -52,8 +54,13 @@ class MacrolactoneFragmenter:
|
|||||||
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
|
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
|
side_chain_atoms = collect_fragmentable_side_chain_atoms(
|
||||||
if not side_chain_atoms:
|
mol=mol,
|
||||||
|
start_atom_idx=neighbor_idx,
|
||||||
|
ring_atom_indices=ring_atom_set,
|
||||||
|
ring_atom_idx=ring_atom_idx,
|
||||||
|
)
|
||||||
|
if side_chain_atoms is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ from typing import Iterable
|
|||||||
|
|
||||||
from rdkit import Chem
|
from rdkit import Chem
|
||||||
|
|
||||||
from .._core import collect_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor
|
from .._core import collect_fragmentable_side_chain_atoms, ensure_mol, find_macrolactone_candidates, is_intrinsic_lactone_neighbor
|
||||||
from ..fragmenter import MacrolactoneFragmenter
|
from ..fragmenter import MacrolactoneFragmenter
|
||||||
|
|
||||||
|
|
||||||
@@ -26,22 +26,36 @@ def prepare_macrolactone_scaffold(
|
|||||||
for position in positions:
|
for position in positions:
|
||||||
if position not in numbering.position_to_atom:
|
if position not in numbering.position_to_atom:
|
||||||
raise ValueError(f"Position {position} not found in ring numbering.")
|
raise ValueError(f"Position {position} not found in ring numbering.")
|
||||||
|
if position <= 2:
|
||||||
|
raise ValueError(f"Position {position} does not contain a single-anchor fragmentable side chain")
|
||||||
ring_atom_idx = numbering.position_to_atom[position]
|
ring_atom_idx = numbering.position_to_atom[position]
|
||||||
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
||||||
|
position_dummy_specs: list[tuple[int, int, Chem.BondType]] = []
|
||||||
|
|
||||||
for neighbor in ring_atom.GetNeighbors():
|
for neighbor in ring_atom.GetNeighbors():
|
||||||
neighbor_idx = neighbor.GetIdx()
|
neighbor_idx = neighbor.GetIdx()
|
||||||
if neighbor_idx in ring_atom_set:
|
if neighbor_idx in ring_atom_set:
|
||||||
continue
|
continue
|
||||||
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
|
if is_intrinsic_lactone_neighbor(mol, candidate, ring_atom_idx, neighbor_idx):
|
||||||
continue
|
continue
|
||||||
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
|
side_chain_atoms = collect_fragmentable_side_chain_atoms(
|
||||||
|
mol=mol,
|
||||||
|
start_atom_idx=neighbor_idx,
|
||||||
|
ring_atom_indices=ring_atom_set,
|
||||||
|
ring_atom_idx=ring_atom_idx,
|
||||||
|
)
|
||||||
|
if side_chain_atoms is None:
|
||||||
|
continue
|
||||||
atoms_to_remove.update(side_chain_atoms)
|
atoms_to_remove.update(side_chain_atoms)
|
||||||
bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx)
|
bond = mol.GetBondBetweenAtoms(ring_atom_idx, neighbor_idx)
|
||||||
if bond is not None:
|
if bond is not None:
|
||||||
dummy_specs.append((ring_atom_idx, position, bond.GetBondType()))
|
position_dummy_specs.append((ring_atom_idx, position, bond.GetBondType()))
|
||||||
|
|
||||||
if not any(spec_position == position for _, spec_position, _ in dummy_specs):
|
if not position_dummy_specs:
|
||||||
dummy_specs.append((ring_atom_idx, position, Chem.BondType.SINGLE))
|
raise ValueError(f"Position {position} does not contain a single-anchor fragmentable side chain")
|
||||||
|
if len(position_dummy_specs) > 1:
|
||||||
|
raise ValueError(f"Position {position} contains multiple fragmentable side chains")
|
||||||
|
dummy_specs.extend(position_dummy_specs)
|
||||||
|
|
||||||
rwmol = Chem.RWMol(mol)
|
rwmol = Chem.RWMol(mol)
|
||||||
for ring_atom_idx, position, bond_type in dummy_specs:
|
for ring_atom_idx, position, bond_type in dummy_specs:
|
||||||
|
|||||||
@@ -0,0 +1,183 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
import sqlite3
|
||||||
|
from functools import lru_cache
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from rdkit import Chem, DataStructs
|
||||||
|
from rdkit.Chem import rdFingerprintGenerator
|
||||||
|
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
|
||||||
|
|
||||||
|
|
||||||
|
_FINGERPRINT_GENERATOR = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def _mol_from_smiles(fragment_smiles: str) -> Chem.Mol:
|
||||||
|
mol = Chem.MolFromSmiles(fragment_smiles)
|
||||||
|
if mol is None:
|
||||||
|
raise ValueError(f"Invalid fragment SMILES: {fragment_smiles}")
|
||||||
|
return mol
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
|
||||||
|
def _fingerprint_from_smiles(fragment_smiles: str) -> ExplicitBitVect:
|
||||||
|
return _FINGERPRINT_GENERATOR.GetFingerprint(_mol_from_smiles(fragment_smiles))
|
||||||
|
|
||||||
|
|
||||||
|
def count_non_dummy_atoms(fragment_smiles: str) -> int:
|
||||||
|
return int(_mol_from_smiles(fragment_smiles).GetNumHeavyAtoms())
|
||||||
|
|
||||||
|
|
||||||
|
def annotate_fragment_atom_counts(dataframe: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
result = dataframe.copy()
|
||||||
|
atom_count_map = {
|
||||||
|
smiles: count_non_dummy_atoms(smiles)
|
||||||
|
for smiles in result["fragment_smiles_plain"].dropna().unique()
|
||||||
|
}
|
||||||
|
result["fragment_atom_count"] = result["fragment_smiles_plain"].map(atom_count_map).astype(int)
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def load_fragment_library_dataset(input_path: str | Path, db_path: str | Path) -> pd.DataFrame:
|
||||||
|
fragments = pd.read_csv(input_path)
|
||||||
|
|
||||||
|
with sqlite3.connect(db_path) as connection:
|
||||||
|
parents = pd.read_sql_query(
|
||||||
|
"""
|
||||||
|
SELECT ml_id, classification, ring_size, processing_status, num_sidechains
|
||||||
|
FROM parent_molecules
|
||||||
|
""",
|
||||||
|
connection,
|
||||||
|
)
|
||||||
|
|
||||||
|
merged = fragments.merge(
|
||||||
|
parents,
|
||||||
|
left_on="source_parent_ml_id",
|
||||||
|
right_on="ml_id",
|
||||||
|
how="left",
|
||||||
|
validate="many_to_one",
|
||||||
|
).drop(columns=["ml_id"])
|
||||||
|
|
||||||
|
if merged[["classification", "ring_size"]].isna().any().any():
|
||||||
|
missing = merged[merged["ring_size"].isna() | merged["classification"].isna()]
|
||||||
|
sample_ids = sorted(missing["source_parent_ml_id"].dropna().unique().tolist())[:10]
|
||||||
|
raise ValueError(f"Missing parent metadata for fragment rows: {sample_ids}")
|
||||||
|
|
||||||
|
return annotate_fragment_atom_counts(merged)
|
||||||
|
|
||||||
|
|
||||||
|
def build_fragment_atom_count_summary(dataframe: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
atom_counts = dataframe["fragment_atom_count"]
|
||||||
|
summary = {
|
||||||
|
"rows": int(len(atom_counts)),
|
||||||
|
"unique_parent_molecules": int(dataframe["source_parent_ml_id"].nunique()),
|
||||||
|
"unique_fragment_smiles": int(dataframe["fragment_smiles_plain"].nunique()),
|
||||||
|
"min_atom_count": int(atom_counts.min()),
|
||||||
|
"p05_atom_count": float(atom_counts.quantile(0.05)),
|
||||||
|
"p25_atom_count": float(atom_counts.quantile(0.25)),
|
||||||
|
"median_atom_count": float(atom_counts.quantile(0.5)),
|
||||||
|
"mean_atom_count": float(atom_counts.mean()),
|
||||||
|
"p75_atom_count": float(atom_counts.quantile(0.75)),
|
||||||
|
"p95_atom_count": float(atom_counts.quantile(0.95)),
|
||||||
|
"max_atom_count": int(atom_counts.max()),
|
||||||
|
}
|
||||||
|
return pd.DataFrame([summary])
|
||||||
|
|
||||||
|
|
||||||
|
def build_fragment_atom_count_frequency_table(dataframe: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
row_counts = dataframe["fragment_atom_count"].value_counts().sort_index()
|
||||||
|
unique_fragment_counts = (
|
||||||
|
dataframe.drop_duplicates(subset=["fragment_smiles_plain"])["fragment_atom_count"]
|
||||||
|
.value_counts()
|
||||||
|
.sort_index()
|
||||||
|
)
|
||||||
|
frequency = pd.DataFrame({"fragment_atom_count": sorted(row_counts.index.union(unique_fragment_counts.index))})
|
||||||
|
frequency["row_count"] = frequency["fragment_atom_count"].map(row_counts).fillna(0).astype(int)
|
||||||
|
frequency["unique_fragment_count"] = (
|
||||||
|
frequency["fragment_atom_count"].map(unique_fragment_counts).fillna(0).astype(int)
|
||||||
|
)
|
||||||
|
frequency["row_fraction"] = frequency["row_count"] / max(int(len(dataframe)), 1)
|
||||||
|
frequency["unique_fragment_fraction"] = frequency["unique_fragment_count"] / max(
|
||||||
|
int(dataframe["fragment_smiles_plain"].nunique()), 1
|
||||||
|
)
|
||||||
|
return frequency
|
||||||
|
|
||||||
|
|
||||||
|
def build_filter_candidate_table(dataframe: pd.DataFrame, max_threshold: int = 10) -> pd.DataFrame:
|
||||||
|
unique_atom_counts = dataframe.drop_duplicates(subset=["fragment_smiles_plain"])[
|
||||||
|
["fragment_smiles_plain", "fragment_atom_count"]
|
||||||
|
]
|
||||||
|
total_rows = int(len(dataframe))
|
||||||
|
total_unique = int(len(unique_atom_counts))
|
||||||
|
thresholds = range(1, min(max_threshold, int(dataframe["fragment_atom_count"].max())) + 1)
|
||||||
|
|
||||||
|
candidates: list[dict[str, float | int]] = []
|
||||||
|
for threshold in thresholds:
|
||||||
|
removed_rows = int((dataframe["fragment_atom_count"] <= threshold).sum())
|
||||||
|
removed_unique = int((unique_atom_counts["fragment_atom_count"] <= threshold).sum())
|
||||||
|
candidates.append(
|
||||||
|
{
|
||||||
|
"drop_if_atom_count_lte": threshold,
|
||||||
|
"removed_rows": removed_rows,
|
||||||
|
"removed_row_fraction": removed_rows / max(total_rows, 1),
|
||||||
|
"retained_rows": total_rows - removed_rows,
|
||||||
|
"retained_row_fraction": (total_rows - removed_rows) / max(total_rows, 1),
|
||||||
|
"removed_unique_fragments": removed_unique,
|
||||||
|
"removed_unique_fraction": removed_unique / max(total_unique, 1),
|
||||||
|
"retained_unique_fragments": total_unique - removed_unique,
|
||||||
|
"retained_unique_fraction": (total_unique - removed_unique) / max(total_unique, 1),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return pd.DataFrame(candidates)
|
||||||
|
|
||||||
|
|
||||||
|
def _shannon_entropy(smiles_series: pd.Series) -> float:
|
||||||
|
probabilities = smiles_series.value_counts(normalize=True)
|
||||||
|
return float(-(probabilities * probabilities.map(math.log)).sum())
|
||||||
|
|
||||||
|
|
||||||
|
def _mean_pairwise_tanimoto_distance(smiles_values: pd.Series) -> float:
|
||||||
|
unique_smiles = sorted(smiles_values.dropna().unique().tolist())
|
||||||
|
if len(unique_smiles) < 2:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
fingerprints = [_fingerprint_from_smiles(smiles) for smiles in unique_smiles]
|
||||||
|
distances: list[float] = []
|
||||||
|
for index, fingerprint in enumerate(fingerprints[:-1]):
|
||||||
|
similarities = DataStructs.BulkTanimotoSimilarity(fingerprint, fingerprints[index + 1 :])
|
||||||
|
distances.extend(1.0 - similarity for similarity in similarities)
|
||||||
|
return float(sum(distances) / len(distances))
|
||||||
|
|
||||||
|
|
||||||
|
def build_position_diversity_table(dataframe: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
working = dataframe if "fragment_atom_count" in dataframe.columns else annotate_fragment_atom_counts(dataframe)
|
||||||
|
rows: list[dict[str, float | int]] = []
|
||||||
|
|
||||||
|
for cleavage_position, group in working.groupby("cleavage_position", sort=True):
|
||||||
|
unique_fragments = int(group["fragment_smiles_plain"].nunique())
|
||||||
|
entropy = _shannon_entropy(group["fragment_smiles_plain"])
|
||||||
|
atom_counts = group["fragment_atom_count"]
|
||||||
|
normalized_entropy = entropy / math.log(unique_fragments) if unique_fragments > 1 else 0.0
|
||||||
|
rows.append(
|
||||||
|
{
|
||||||
|
"cleavage_position": int(cleavage_position),
|
||||||
|
"total_fragments": int(len(group)),
|
||||||
|
"unique_fragments": unique_fragments,
|
||||||
|
"normalized_unique_ratio": unique_fragments / max(int(len(group)), 1),
|
||||||
|
"shannon_entropy": entropy,
|
||||||
|
"normalized_shannon_entropy": normalized_entropy,
|
||||||
|
"mean_pairwise_tanimoto_distance": _mean_pairwise_tanimoto_distance(group["fragment_smiles_plain"]),
|
||||||
|
"mean_atom_count": float(atom_counts.mean()),
|
||||||
|
"median_atom_count": float(atom_counts.median()),
|
||||||
|
"std_atom_count": float(atom_counts.std(ddof=0)),
|
||||||
|
"iqr_atom_count": float(atom_counts.quantile(0.75) - atom_counts.quantile(0.25)),
|
||||||
|
"min_atom_count": int(atom_counts.min()),
|
||||||
|
"max_atom_count": int(atom_counts.max()),
|
||||||
|
"atom_count_range": int(atom_counts.max() - atom_counts.min()),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return pd.DataFrame(rows).sort_values("cleavage_position").reset_index(drop=True)
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import UTC, datetime
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
||||||
@@ -27,7 +27,8 @@ class ParentMolecule(SQLModel, table=True):
|
|||||||
__tablename__ = "parent_molecules"
|
__tablename__ = "parent_molecules"
|
||||||
|
|
||||||
id: Optional[int] = Field(default=None, primary_key=True)
|
id: Optional[int] = Field(default=None, primary_key=True)
|
||||||
source_id: str = Field(index=True)
|
ml_id: str = Field(index=True) # MacrolactoneDB unique ID (e.g., ML00000001)
|
||||||
|
chembl_id: Optional[str] = Field(default=None, index=True) # Original CHEMBL ID
|
||||||
molecule_name: Optional[str] = None
|
molecule_name: Optional[str] = None
|
||||||
smiles: str = Field(index=True)
|
smiles: str = Field(index=True)
|
||||||
classification: str = Field(index=True)
|
classification: str = Field(index=True)
|
||||||
@@ -39,7 +40,7 @@ class ParentMolecule(SQLModel, table=True):
|
|||||||
num_sidechains: Optional[int] = None
|
num_sidechains: Optional[int] = None
|
||||||
cleavage_positions: Optional[str] = None
|
cleavage_positions: Optional[str] = None
|
||||||
numbered_image_path: Optional[str] = None
|
numbered_image_path: Optional[str] = None
|
||||||
created_at: datetime = Field(default_factory=datetime.utcnow)
|
created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
||||||
processed_at: Optional[datetime] = None
|
processed_at: Optional[datetime] = None
|
||||||
|
|
||||||
|
|
||||||
@@ -71,6 +72,8 @@ class SideChainFragment(SQLModel, table=True):
|
|||||||
fragment_smiles_labeled: str
|
fragment_smiles_labeled: str
|
||||||
fragment_smiles_plain: str
|
fragment_smiles_plain: str
|
||||||
dummy_isotope: int
|
dummy_isotope: int
|
||||||
|
has_dummy_atom: bool = Field(default=True)
|
||||||
|
dummy_atom_count: int = Field(default=1)
|
||||||
atom_count: int
|
atom_count: int
|
||||||
heavy_atom_count: int
|
heavy_atom_count: int
|
||||||
molecular_weight: float
|
molecular_weight: float
|
||||||
@@ -78,6 +81,26 @@ class SideChainFragment(SQLModel, table=True):
|
|||||||
image_path: Optional[str] = None
|
image_path: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
class FragmentLibraryEntry(SQLModel, table=True):
|
||||||
|
"""Unified fragment library entries."""
|
||||||
|
|
||||||
|
__tablename__ = "fragment_library_entries"
|
||||||
|
|
||||||
|
id: Optional[int] = Field(default=None, primary_key=True)
|
||||||
|
source_type: str = Field(index=True)
|
||||||
|
source_fragment_id: Optional[str] = Field(default=None, index=True)
|
||||||
|
source_parent_ml_id: Optional[str] = Field(default=None, index=True)
|
||||||
|
source_parent_chembl_id: Optional[str] = Field(default=None, index=True)
|
||||||
|
cleavage_position: Optional[int] = Field(default=None, index=True)
|
||||||
|
fragment_smiles_labeled: Optional[str] = None
|
||||||
|
fragment_smiles_plain: str
|
||||||
|
has_dummy_atom: bool = Field(default=False)
|
||||||
|
dummy_atom_count: int = Field(default=0)
|
||||||
|
splice_ready: bool = Field(default=False, index=True)
|
||||||
|
original_bond_type: Optional[str] = None
|
||||||
|
created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
||||||
|
|
||||||
|
|
||||||
class ValidationResult(SQLModel, table=True):
|
class ValidationResult(SQLModel, table=True):
|
||||||
"""Manual validation records."""
|
"""Manual validation records."""
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from datetime import datetime
|
from datetime import UTC, datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -12,7 +12,7 @@ from sqlmodel import select
|
|||||||
from macro_lactone_toolkit import MacroLactoneAnalyzer
|
from macro_lactone_toolkit import MacroLactoneAnalyzer
|
||||||
from macro_lactone_toolkit._core import (
|
from macro_lactone_toolkit._core import (
|
||||||
build_numbering_result,
|
build_numbering_result,
|
||||||
collect_side_chain_atoms,
|
collect_fragmentable_side_chain_atoms,
|
||||||
find_macrolactone_candidates,
|
find_macrolactone_candidates,
|
||||||
is_intrinsic_lactone_neighbor,
|
is_intrinsic_lactone_neighbor,
|
||||||
)
|
)
|
||||||
@@ -20,6 +20,7 @@ from macro_lactone_toolkit.validation.database import get_engine, get_session, i
|
|||||||
from macro_lactone_toolkit.validation.isotope_utils import build_fragment_with_isotope
|
from macro_lactone_toolkit.validation.isotope_utils import build_fragment_with_isotope
|
||||||
from macro_lactone_toolkit.validation.models import (
|
from macro_lactone_toolkit.validation.models import (
|
||||||
ClassificationType,
|
ClassificationType,
|
||||||
|
FragmentLibraryEntry,
|
||||||
ParentMolecule,
|
ParentMolecule,
|
||||||
ProcessingStatus,
|
ProcessingStatus,
|
||||||
RingNumbering,
|
RingNumbering,
|
||||||
@@ -41,12 +42,14 @@ class MacrolactoneValidator:
|
|||||||
output_dir: str | Path,
|
output_dir: str | Path,
|
||||||
sample_ratio: float = 0.1,
|
sample_ratio: float = 0.1,
|
||||||
smiles_col: str = "smiles",
|
smiles_col: str = "smiles",
|
||||||
id_col: str = "IDs",
|
id_col: str = "ml_id",
|
||||||
|
chembl_id_col: str = "IDs",
|
||||||
):
|
):
|
||||||
self.output_dir = Path(output_dir)
|
self.output_dir = Path(output_dir)
|
||||||
self.sample_ratio = sample_ratio
|
self.sample_ratio = sample_ratio
|
||||||
self.smiles_col = smiles_col
|
self.smiles_col = smiles_col
|
||||||
self.id_col = id_col
|
self.id_col = id_col
|
||||||
|
self.chembl_id_col = chembl_id_col
|
||||||
|
|
||||||
self.analyzer = MacroLactoneAnalyzer()
|
self.analyzer = MacroLactoneAnalyzer()
|
||||||
|
|
||||||
@@ -78,12 +81,14 @@ class MacrolactoneValidator:
|
|||||||
# Generate outputs
|
# Generate outputs
|
||||||
self._generate_readme()
|
self._generate_readme()
|
||||||
self._generate_summary()
|
self._generate_summary()
|
||||||
|
self._generate_fragment_library()
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def _process_molecule(self, row: pd.Series) -> str:
|
def _process_molecule(self, row: pd.Series) -> str:
|
||||||
"""Process a single molecule. Returns status."""
|
"""Process a single molecule. Returns status."""
|
||||||
source_id = str(row[self.id_col])
|
ml_id = str(row[self.id_col])
|
||||||
|
chembl_id = str(row[self.chembl_id_col]) if self.chembl_id_col in row and pd.notna(row[self.chembl_id_col]) else None
|
||||||
smiles = row[self.smiles_col]
|
smiles = row[self.smiles_col]
|
||||||
name = row.get("molecule_pref_name", None)
|
name = row.get("molecule_pref_name", None)
|
||||||
|
|
||||||
@@ -105,7 +110,8 @@ class MacrolactoneValidator:
|
|||||||
|
|
||||||
# Create parent record
|
# Create parent record
|
||||||
parent = ParentMolecule(
|
parent = ParentMolecule(
|
||||||
source_id=source_id,
|
ml_id=ml_id,
|
||||||
|
chembl_id=chembl_id,
|
||||||
molecule_name=name,
|
molecule_name=name,
|
||||||
smiles=smiles,
|
smiles=smiles,
|
||||||
classification=classification,
|
classification=classification,
|
||||||
@@ -124,7 +130,7 @@ class MacrolactoneValidator:
|
|||||||
parent.processing_status = ProcessingStatus.SKIPPED
|
parent.processing_status = ProcessingStatus.SKIPPED
|
||||||
session.add(parent)
|
session.add(parent)
|
||||||
session.commit()
|
session.commit()
|
||||||
self._save_original_image(smiles, source_id, ring_size, classification)
|
self._save_original_image(smiles, ml_id, ring_size, classification)
|
||||||
return "skipped"
|
return "skipped"
|
||||||
|
|
||||||
# Process standard macrolactone
|
# Process standard macrolactone
|
||||||
@@ -134,7 +140,7 @@ class MacrolactoneValidator:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
parent.processing_status = ProcessingStatus.FAILED
|
parent.processing_status = ProcessingStatus.FAILED
|
||||||
parent.error_message = str(e)
|
parent.error_message = str(e)
|
||||||
parent.processed_at = datetime.utcnow()
|
parent.processed_at = datetime.now(UTC)
|
||||||
session.add(parent)
|
session.add(parent)
|
||||||
session.commit()
|
session.commit()
|
||||||
return "failed"
|
return "failed"
|
||||||
@@ -172,7 +178,7 @@ class MacrolactoneValidator:
|
|||||||
|
|
||||||
# Save numbered image
|
# Save numbered image
|
||||||
paths = get_output_paths(
|
paths = get_output_paths(
|
||||||
self.output_dir, parent.source_id, parent.ring_size, "standard_macrolactone"
|
self.output_dir, parent.ml_id, parent.ring_size, "standard_macrolactone"
|
||||||
)
|
)
|
||||||
image_path = save_numbered_molecule(smiles, paths["numbered_image"], parent.ring_size)
|
image_path = save_numbered_molecule(smiles, paths["numbered_image"], parent.ring_size)
|
||||||
if image_path:
|
if image_path:
|
||||||
@@ -184,6 +190,8 @@ class MacrolactoneValidator:
|
|||||||
fragment_idx = 0
|
fragment_idx = 0
|
||||||
|
|
||||||
for position, ring_atom_idx in numbering.position_to_atom.items():
|
for position, ring_atom_idx in numbering.position_to_atom.items():
|
||||||
|
if int(position) <= 2:
|
||||||
|
continue
|
||||||
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
ring_atom = mol.GetAtomWithIdx(ring_atom_idx)
|
||||||
|
|
||||||
for neighbor in ring_atom.GetNeighbors():
|
for neighbor in ring_atom.GetNeighbors():
|
||||||
@@ -196,8 +204,13 @@ class MacrolactoneValidator:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# Collect side chain atoms
|
# Collect side chain atoms
|
||||||
side_chain_atoms = collect_side_chain_atoms(mol, neighbor_idx, ring_atom_set)
|
side_chain_atoms = collect_fragmentable_side_chain_atoms(
|
||||||
if not side_chain_atoms:
|
mol=mol,
|
||||||
|
start_atom_idx=neighbor_idx,
|
||||||
|
ring_atom_indices=ring_atom_set,
|
||||||
|
ring_atom_idx=ring_atom_idx,
|
||||||
|
)
|
||||||
|
if side_chain_atoms is None:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Build fragment with isotope tagging
|
# Build fragment with isotope tagging
|
||||||
@@ -217,13 +230,15 @@ class MacrolactoneValidator:
|
|||||||
# Create fragment record
|
# Create fragment record
|
||||||
fragment = SideChainFragment(
|
fragment = SideChainFragment(
|
||||||
parent_id=parent.id,
|
parent_id=parent.id,
|
||||||
fragment_id=f"{parent.source_id}_frag_{fragment_idx}",
|
fragment_id=f"{parent.ml_id}_frag_{fragment_idx}",
|
||||||
cleavage_position=int(position),
|
cleavage_position=int(position),
|
||||||
attachment_atom_idx=ring_atom_idx,
|
attachment_atom_idx=ring_atom_idx,
|
||||||
attachment_atom_symbol=ring_atom.GetSymbol(),
|
attachment_atom_symbol=ring_atom.GetSymbol(),
|
||||||
fragment_smiles_labeled=labeled_smiles,
|
fragment_smiles_labeled=labeled_smiles,
|
||||||
fragment_smiles_plain=plain_smiles,
|
fragment_smiles_plain=plain_smiles,
|
||||||
dummy_isotope=int(position),
|
dummy_isotope=int(position),
|
||||||
|
has_dummy_atom=True,
|
||||||
|
dummy_atom_count=1,
|
||||||
atom_count=atom_count,
|
atom_count=atom_count,
|
||||||
heavy_atom_count=heavy_atom_count,
|
heavy_atom_count=heavy_atom_count,
|
||||||
molecular_weight=round(mw, 4),
|
molecular_weight=round(mw, 4),
|
||||||
@@ -231,11 +246,26 @@ class MacrolactoneValidator:
|
|||||||
)
|
)
|
||||||
session.add(fragment)
|
session.add(fragment)
|
||||||
fragments.append(fragment)
|
fragments.append(fragment)
|
||||||
|
session.add(
|
||||||
|
FragmentLibraryEntry(
|
||||||
|
source_type="validation_extract",
|
||||||
|
source_fragment_id=fragment.fragment_id,
|
||||||
|
source_parent_ml_id=parent.ml_id,
|
||||||
|
source_parent_chembl_id=parent.chembl_id,
|
||||||
|
cleavage_position=int(position),
|
||||||
|
fragment_smiles_labeled=labeled_smiles,
|
||||||
|
fragment_smiles_plain=plain_smiles,
|
||||||
|
has_dummy_atom=True,
|
||||||
|
dummy_atom_count=1,
|
||||||
|
splice_ready=True,
|
||||||
|
original_bond_type=bond_type,
|
||||||
|
)
|
||||||
|
)
|
||||||
fragment_idx += 1
|
fragment_idx += 1
|
||||||
|
|
||||||
# Save fragment images
|
# Save fragment images
|
||||||
if fragments and paths["sidechains_dir"]:
|
if fragments and paths["sidechains_dir"]:
|
||||||
image_paths = save_fragment_images(fragments, paths["sidechains_dir"], parent.source_id)
|
image_paths = save_fragment_images(fragments, paths["sidechains_dir"], parent.ml_id)
|
||||||
for frag, img_path in zip(fragments, image_paths):
|
for frag, img_path in zip(fragments, image_paths):
|
||||||
frag.image_path = img_path
|
frag.image_path = img_path
|
||||||
session.add(frag)
|
session.add(frag)
|
||||||
@@ -244,13 +274,13 @@ class MacrolactoneValidator:
|
|||||||
parent.processing_status = ProcessingStatus.SUCCESS
|
parent.processing_status = ProcessingStatus.SUCCESS
|
||||||
parent.num_sidechains = len(fragments)
|
parent.num_sidechains = len(fragments)
|
||||||
parent.cleavage_positions = json.dumps([f.cleavage_position for f in fragments])
|
parent.cleavage_positions = json.dumps([f.cleavage_position for f in fragments])
|
||||||
parent.processed_at = datetime.utcnow()
|
parent.processed_at = datetime.now(UTC)
|
||||||
session.add(parent)
|
session.add(parent)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|
||||||
def _save_original_image(self, smiles: str, source_id: str, ring_size: int, classification: str):
|
def _save_original_image(self, smiles: str, ml_id: str, ring_size: int, classification: str):
|
||||||
"""Save original image for non-standard molecules."""
|
"""Save original image for non-standard molecules."""
|
||||||
paths = get_output_paths(self.output_dir, source_id, ring_size, classification)
|
paths = get_output_paths(self.output_dir, ml_id, ring_size, classification)
|
||||||
try:
|
try:
|
||||||
from rdkit.Chem import Draw
|
from rdkit.Chem import Draw
|
||||||
|
|
||||||
@@ -272,6 +302,7 @@ This directory contains validation results for MacrolactoneDB 12-20 membered rin
|
|||||||
validation_output/
|
validation_output/
|
||||||
├── README.md # This file
|
├── README.md # This file
|
||||||
├── fragments.db # SQLite database with all data
|
├── fragments.db # SQLite database with all data
|
||||||
|
├── fragment_library.csv # Unified fragment library export
|
||||||
├── summary.csv # Summary of all processed molecules
|
├── summary.csv # Summary of all processed molecules
|
||||||
├── summary_statistics.json # Statistical summary
|
├── summary_statistics.json # Statistical summary
|
||||||
│
|
│
|
||||||
@@ -301,6 +332,7 @@ validation_output/
|
|||||||
- **parent_molecules**: Original molecule information
|
- **parent_molecules**: Original molecule information
|
||||||
- **ring_numberings**: Ring atom numbering details
|
- **ring_numberings**: Ring atom numbering details
|
||||||
- **side_chain_fragments**: Fragmentation results with isotope tags
|
- **side_chain_fragments**: Fragmentation results with isotope tags
|
||||||
|
- **fragment_library_entries**: Unified fragment library rows for downstream design
|
||||||
- **validation_results**: Manual validation records
|
- **validation_results**: Manual validation records
|
||||||
|
|
||||||
### Key Fields
|
### Key Fields
|
||||||
@@ -308,6 +340,8 @@ validation_output/
|
|||||||
- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone
|
- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone
|
||||||
- `dummy_isotope`: Cleavage position stored as isotope value for reconstruction
|
- `dummy_isotope`: Cleavage position stored as isotope value for reconstruction
|
||||||
- `cleavage_position`: Position on ring where side chain was attached
|
- `cleavage_position`: Position on ring where side chain was attached
|
||||||
|
- `has_dummy_atom`: Whether the fragment contains a dummy atom for splicing
|
||||||
|
- `dummy_atom_count`: Number of dummy atoms in the fragment
|
||||||
|
|
||||||
## Ring Numbering Convention
|
## Ring Numbering Convention
|
||||||
|
|
||||||
@@ -325,13 +359,21 @@ Fragments use isotope values to mark cleavage position:
|
|||||||
|
|
||||||
### summary.csv
|
### summary.csv
|
||||||
|
|
||||||
- `source_id`: Original molecule ID from MacrolactoneDB
|
- `ml_id`: MacrolactoneDB unique ID (e.g., ML00000001)
|
||||||
|
- `chembl_id`: Original CHEMBL ID (if available)
|
||||||
- `classification`: Classification result
|
- `classification`: Classification result
|
||||||
- `ring_size`: Detected ring size (12-20)
|
- `ring_size`: Detected ring size (12-20)
|
||||||
- `num_sidechains`: Number of side chains detected
|
- `num_sidechains`: Number of side chains detected
|
||||||
- `cleavage_positions`: JSON array of cleavage positions
|
- `cleavage_positions`: JSON array of cleavage positions
|
||||||
- `processing_status`: pending | success | failed | skipped
|
- `processing_status`: pending | success | failed | skipped
|
||||||
|
|
||||||
|
### fragment_library.csv
|
||||||
|
|
||||||
|
- `source_type`: validation_extract | supplemental (reserved)
|
||||||
|
- `has_dummy_atom`: Whether the fragment contains a dummy atom
|
||||||
|
- `dummy_atom_count`: Number of dummy atoms
|
||||||
|
- `splice_ready`: Whether the fragment is directly compatible with single-anchor splicing
|
||||||
|
|
||||||
## Querying the Database
|
## Querying the Database
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -363,7 +405,8 @@ sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY
|
|||||||
for p in parents:
|
for p in parents:
|
||||||
data.append({
|
data.append({
|
||||||
"id": p.id,
|
"id": p.id,
|
||||||
"source_id": p.source_id,
|
"ml_id": p.ml_id,
|
||||||
|
"chembl_id": p.chembl_id,
|
||||||
"molecule_name": p.molecule_name,
|
"molecule_name": p.molecule_name,
|
||||||
"smiles": p.smiles,
|
"smiles": p.smiles,
|
||||||
"classification": p.classification,
|
"classification": p.classification,
|
||||||
@@ -395,6 +438,47 @@ sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY
|
|||||||
print(f"\nSummary saved to {self.output_dir / 'summary.csv'}")
|
print(f"\nSummary saved to {self.output_dir / 'summary.csv'}")
|
||||||
print(f"Statistics: {stats}")
|
print(f"Statistics: {stats}")
|
||||||
|
|
||||||
|
def _generate_fragment_library(self):
|
||||||
|
"""Generate unified fragment library CSV."""
|
||||||
|
columns = [
|
||||||
|
"id",
|
||||||
|
"source_type",
|
||||||
|
"source_fragment_id",
|
||||||
|
"source_parent_ml_id",
|
||||||
|
"source_parent_chembl_id",
|
||||||
|
"cleavage_position",
|
||||||
|
"fragment_smiles_labeled",
|
||||||
|
"fragment_smiles_plain",
|
||||||
|
"has_dummy_atom",
|
||||||
|
"dummy_atom_count",
|
||||||
|
"splice_ready",
|
||||||
|
"original_bond_type",
|
||||||
|
"created_at",
|
||||||
|
]
|
||||||
|
|
||||||
|
with get_session(self.engine) as session:
|
||||||
|
entries = session.exec(select(FragmentLibraryEntry)).all()
|
||||||
|
data = [
|
||||||
|
{
|
||||||
|
"id": entry.id,
|
||||||
|
"source_type": entry.source_type,
|
||||||
|
"source_fragment_id": entry.source_fragment_id,
|
||||||
|
"source_parent_ml_id": entry.source_parent_ml_id,
|
||||||
|
"source_parent_chembl_id": entry.source_parent_chembl_id,
|
||||||
|
"cleavage_position": entry.cleavage_position,
|
||||||
|
"fragment_smiles_labeled": entry.fragment_smiles_labeled,
|
||||||
|
"fragment_smiles_plain": entry.fragment_smiles_plain,
|
||||||
|
"has_dummy_atom": entry.has_dummy_atom,
|
||||||
|
"dummy_atom_count": entry.dummy_atom_count,
|
||||||
|
"splice_ready": entry.splice_ready,
|
||||||
|
"original_bond_type": entry.original_bond_type,
|
||||||
|
"created_at": entry.created_at,
|
||||||
|
}
|
||||||
|
for entry in entries
|
||||||
|
]
|
||||||
|
|
||||||
|
pd.DataFrame(data, columns=columns).to_csv(self.output_dir / "fragment_library.csv", index=False)
|
||||||
|
|
||||||
|
|
||||||
class MacrolactoneDetectionError(Exception):
|
class MacrolactoneDetectionError(Exception):
|
||||||
"""Raised when macrolactone detection fails."""
|
"""Raised when macrolactone detection fails."""
|
||||||
|
|||||||
102
tests/helpers.py
102
tests/helpers.py
@@ -78,6 +78,108 @@ def build_non_standard_ring_atom_macrolactone(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def build_macrolactone_with_fused_side_ring(
|
||||||
|
ring_size: int = 16,
|
||||||
|
fused_positions: tuple[int, int] = (5, 6),
|
||||||
|
side_chains: Mapping[int, str] | None = None,
|
||||||
|
) -> BuiltMacrolactone:
|
||||||
|
base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
|
||||||
|
position_a, position_b = fused_positions
|
||||||
|
rwmol = Chem.RWMol(Chem.Mol(base.mol))
|
||||||
|
|
||||||
|
atom_x = rwmol.AddAtom(Chem.Atom("C"))
|
||||||
|
atom_y = rwmol.AddAtom(Chem.Atom("C"))
|
||||||
|
rwmol.AddBond(base.position_to_atom[position_a], atom_x, Chem.BondType.SINGLE)
|
||||||
|
rwmol.AddBond(atom_x, atom_y, Chem.BondType.SINGLE)
|
||||||
|
rwmol.AddBond(atom_y, base.position_to_atom[position_b], Chem.BondType.SINGLE)
|
||||||
|
|
||||||
|
mol = rwmol.GetMol()
|
||||||
|
Chem.SanitizeMol(mol)
|
||||||
|
return BuiltMacrolactone(
|
||||||
|
mol=mol,
|
||||||
|
smiles=Chem.MolToSmiles(mol, isomericSmiles=True),
|
||||||
|
position_to_atom=base.position_to_atom,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def build_macrolactone_with_bridge_side_chain(
|
||||||
|
ring_size: int = 16,
|
||||||
|
bridge_positions: tuple[int, int] = (5, 8),
|
||||||
|
side_chains: Mapping[int, str] | None = None,
|
||||||
|
) -> BuiltMacrolactone:
|
||||||
|
base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
|
||||||
|
position_a, position_b = bridge_positions
|
||||||
|
rwmol = Chem.RWMol(Chem.Mol(base.mol))
|
||||||
|
|
||||||
|
atom_x = rwmol.AddAtom(Chem.Atom("C"))
|
||||||
|
atom_y = rwmol.AddAtom(Chem.Atom("C"))
|
||||||
|
rwmol.AddBond(base.position_to_atom[position_a], atom_x, Chem.BondType.SINGLE)
|
||||||
|
rwmol.AddBond(atom_x, atom_y, Chem.BondType.SINGLE)
|
||||||
|
rwmol.AddBond(atom_y, base.position_to_atom[position_b], Chem.BondType.SINGLE)
|
||||||
|
|
||||||
|
mol = rwmol.GetMol()
|
||||||
|
Chem.SanitizeMol(mol)
|
||||||
|
return BuiltMacrolactone(
|
||||||
|
mol=mol,
|
||||||
|
smiles=Chem.MolToSmiles(mol, isomericSmiles=True),
|
||||||
|
position_to_atom=base.position_to_atom,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def build_macrolactone_with_shared_atom_side_ring(
|
||||||
|
ring_size: int = 16,
|
||||||
|
position: int = 5,
|
||||||
|
side_chains: Mapping[int, str] | None = None,
|
||||||
|
) -> BuiltMacrolactone:
|
||||||
|
base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
|
||||||
|
rwmol = Chem.RWMol(Chem.Mol(base.mol))
|
||||||
|
|
||||||
|
atom_x = rwmol.AddAtom(Chem.Atom("C"))
|
||||||
|
atom_y = rwmol.AddAtom(Chem.Atom("C"))
|
||||||
|
atom_z = rwmol.AddAtom(Chem.Atom("C"))
|
||||||
|
ring_atom_idx = base.position_to_atom[position]
|
||||||
|
|
||||||
|
rwmol.AddBond(ring_atom_idx, atom_x, Chem.BondType.SINGLE)
|
||||||
|
rwmol.AddBond(atom_x, atom_y, Chem.BondType.SINGLE)
|
||||||
|
rwmol.AddBond(atom_y, atom_z, Chem.BondType.SINGLE)
|
||||||
|
rwmol.AddBond(atom_z, ring_atom_idx, Chem.BondType.SINGLE)
|
||||||
|
|
||||||
|
mol = rwmol.GetMol()
|
||||||
|
Chem.SanitizeMol(mol)
|
||||||
|
return BuiltMacrolactone(
|
||||||
|
mol=mol,
|
||||||
|
smiles=Chem.MolToSmiles(mol, isomericSmiles=True),
|
||||||
|
position_to_atom=base.position_to_atom,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def build_macrolactone_with_single_anchor_side_ring(
|
||||||
|
ring_size: int = 16,
|
||||||
|
position: int = 5,
|
||||||
|
side_chains: Mapping[int, str] | None = None,
|
||||||
|
) -> BuiltMacrolactone:
|
||||||
|
base = build_macrolactone(ring_size=ring_size, side_chains=side_chains)
|
||||||
|
rwmol = Chem.RWMol(Chem.Mol(base.mol))
|
||||||
|
|
||||||
|
atom_x = rwmol.AddAtom(Chem.Atom("C"))
|
||||||
|
atom_y = rwmol.AddAtom(Chem.Atom("C"))
|
||||||
|
atom_z = rwmol.AddAtom(Chem.Atom("C"))
|
||||||
|
ring_atom_idx = base.position_to_atom[position]
|
||||||
|
|
||||||
|
rwmol.AddBond(ring_atom_idx, atom_x, Chem.BondType.SINGLE)
|
||||||
|
rwmol.AddBond(atom_x, atom_y, Chem.BondType.SINGLE)
|
||||||
|
rwmol.AddBond(atom_y, atom_z, Chem.BondType.SINGLE)
|
||||||
|
rwmol.AddBond(atom_z, atom_x, Chem.BondType.SINGLE)
|
||||||
|
|
||||||
|
mol = rwmol.GetMol()
|
||||||
|
Chem.SanitizeMol(mol)
|
||||||
|
return BuiltMacrolactone(
|
||||||
|
mol=mol,
|
||||||
|
smiles=Chem.MolToSmiles(mol, isomericSmiles=True),
|
||||||
|
position_to_atom=base.position_to_atom,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone:
|
def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone:
|
||||||
rwmol = Chem.RWMol()
|
rwmol = Chem.RWMol()
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,12 @@ from rdkit import Chem
|
|||||||
|
|
||||||
from macro_lactone_toolkit import MacrolactoneFragmenter
|
from macro_lactone_toolkit import MacrolactoneFragmenter
|
||||||
|
|
||||||
from .helpers import build_macrolactone
|
from .helpers import (
|
||||||
|
build_macrolactone,
|
||||||
|
build_macrolactone_with_fused_side_ring,
|
||||||
|
build_macrolactone_with_shared_atom_side_ring,
|
||||||
|
build_macrolactone_with_single_anchor_side_ring,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_fragmentation_returns_empty_list_without_sidechains():
|
def test_fragmentation_returns_empty_list_without_sidechains():
|
||||||
@@ -51,3 +56,24 @@ def test_fragmentation_preserves_attachment_bond_type():
|
|||||||
neighbor = dummy_atom.GetNeighbors()[0]
|
neighbor = dummy_atom.GetNeighbors()[0]
|
||||||
bond = mol.GetBondBetweenAtoms(dummy_atom.GetIdx(), neighbor.GetIdx())
|
bond = mol.GetBondBetweenAtoms(dummy_atom.GetIdx(), neighbor.GetIdx())
|
||||||
assert bond.GetBondType() == Chem.BondType.DOUBLE
|
assert bond.GetBondType() == Chem.BondType.DOUBLE
|
||||||
|
|
||||||
|
|
||||||
|
def test_fragmentation_skips_fused_side_ring_but_keeps_single_anchor_sidechains():
|
||||||
|
built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
|
||||||
|
result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="fused")
|
||||||
|
|
||||||
|
assert {fragment.cleavage_position for fragment in result.fragments} == {10}
|
||||||
|
|
||||||
|
|
||||||
|
def test_fragmentation_skips_shared_atom_multi_anchor_component():
|
||||||
|
built = build_macrolactone_with_shared_atom_side_ring(side_chains={11: "ethyl"})
|
||||||
|
result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="shared_atom")
|
||||||
|
|
||||||
|
assert {fragment.cleavage_position for fragment in result.fragments} == {11}
|
||||||
|
|
||||||
|
|
||||||
|
def test_fragmentation_allows_single_anchor_side_ring():
|
||||||
|
built = build_macrolactone_with_single_anchor_side_ring()
|
||||||
|
result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="single_anchor_ring")
|
||||||
|
|
||||||
|
assert {fragment.cleavage_position for fragment in result.fragments} == {5}
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import sqlite3
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -133,6 +134,159 @@ def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path):
|
|||||||
assert (output_dir / "sdf" / "sdf_1_3d.sdf").exists()
|
assert (output_dir / "sdf" / "sdf_1_3d.sdf").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_analyze_validation_fragment_library_script_generates_reports(tmp_path):
|
||||||
|
input_path = tmp_path / "fragment_library.csv"
|
||||||
|
db_path = tmp_path / "fragments.db"
|
||||||
|
output_dir = tmp_path / "fragment_library_analysis"
|
||||||
|
|
||||||
|
pd.DataFrame(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"source_type": "validation_extract",
|
||||||
|
"source_fragment_id": "ML16A_frag_0",
|
||||||
|
"source_parent_ml_id": "ML16A",
|
||||||
|
"source_parent_chembl_id": None,
|
||||||
|
"cleavage_position": 3,
|
||||||
|
"fragment_smiles_labeled": "[3*]C",
|
||||||
|
"fragment_smiles_plain": "*C",
|
||||||
|
"has_dummy_atom": True,
|
||||||
|
"dummy_atom_count": 1,
|
||||||
|
"splice_ready": True,
|
||||||
|
"original_bond_type": "SINGLE",
|
||||||
|
"created_at": "2026-03-19 00:00:00",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"source_type": "validation_extract",
|
||||||
|
"source_fragment_id": "ML16A_frag_1",
|
||||||
|
"source_parent_ml_id": "ML16A",
|
||||||
|
"source_parent_chembl_id": None,
|
||||||
|
"cleavage_position": 3,
|
||||||
|
"fragment_smiles_labeled": "[3*]CC",
|
||||||
|
"fragment_smiles_plain": "*CC",
|
||||||
|
"has_dummy_atom": True,
|
||||||
|
"dummy_atom_count": 1,
|
||||||
|
"splice_ready": True,
|
||||||
|
"original_bond_type": "SINGLE",
|
||||||
|
"created_at": "2026-03-19 00:00:00",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"source_type": "validation_extract",
|
||||||
|
"source_fragment_id": "ML16B_frag_0",
|
||||||
|
"source_parent_ml_id": "ML16B",
|
||||||
|
"source_parent_chembl_id": None,
|
||||||
|
"cleavage_position": 4,
|
||||||
|
"fragment_smiles_labeled": "[4*]O",
|
||||||
|
"fragment_smiles_plain": "*O",
|
||||||
|
"has_dummy_atom": True,
|
||||||
|
"dummy_atom_count": 1,
|
||||||
|
"splice_ready": True,
|
||||||
|
"original_bond_type": "SINGLE",
|
||||||
|
"created_at": "2026-03-19 00:00:00",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"source_type": "validation_extract",
|
||||||
|
"source_fragment_id": "ML14A_frag_0",
|
||||||
|
"source_parent_ml_id": "ML14A",
|
||||||
|
"source_parent_chembl_id": None,
|
||||||
|
"cleavage_position": 3,
|
||||||
|
"fragment_smiles_labeled": "[3*]CCC",
|
||||||
|
"fragment_smiles_plain": "*CCC",
|
||||||
|
"has_dummy_atom": True,
|
||||||
|
"dummy_atom_count": 1,
|
||||||
|
"splice_ready": True,
|
||||||
|
"original_bond_type": "SINGLE",
|
||||||
|
"created_at": "2026-03-19 00:00:00",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
).to_csv(input_path, index=False)
|
||||||
|
|
||||||
|
with sqlite3.connect(db_path) as connection:
|
||||||
|
connection.execute(
|
||||||
|
"""
|
||||||
|
CREATE TABLE parent_molecules (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
ml_id TEXT NOT NULL,
|
||||||
|
chembl_id TEXT,
|
||||||
|
molecule_name TEXT,
|
||||||
|
smiles TEXT NOT NULL,
|
||||||
|
classification TEXT NOT NULL,
|
||||||
|
ring_size INTEGER,
|
||||||
|
primary_reason_code TEXT,
|
||||||
|
primary_reason_message TEXT,
|
||||||
|
processing_status TEXT NOT NULL,
|
||||||
|
error_message TEXT,
|
||||||
|
num_sidechains INTEGER,
|
||||||
|
cleavage_positions TEXT,
|
||||||
|
numbered_image_path TEXT,
|
||||||
|
created_at TEXT NOT NULL,
|
||||||
|
processed_at TEXT
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
connection.executemany(
|
||||||
|
"""
|
||||||
|
INSERT INTO parent_molecules (
|
||||||
|
id, ml_id, chembl_id, molecule_name, smiles, classification, ring_size,
|
||||||
|
primary_reason_code, primary_reason_message, processing_status, error_message,
|
||||||
|
num_sidechains, cleavage_positions, numbered_image_path, created_at, processed_at
|
||||||
|
)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""",
|
||||||
|
[
|
||||||
|
(1, "ML16A", None, None, "C1CCCCCCCCCCCCCC(=O)O1", "standard_macrolactone", 16, None, None, "success", None, 2, "[3]", None, "2026-03-19 00:00:00", None),
|
||||||
|
(2, "ML16B", None, None, "C1CCCCCCCCCCCCCC(=O)O1", "standard_macrolactone", 16, None, None, "success", None, 1, "[4]", None, "2026-03-19 00:00:00", None),
|
||||||
|
(3, "ML14A", None, None, "C1CCCCCCCCCCCC(=O)O1", "standard_macrolactone", 14, None, None, "success", None, 1, "[3]", None, "2026-03-19 00:00:00", None),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
connection.commit()
|
||||||
|
|
||||||
|
completed = run_script(
|
||||||
|
"analyze_validation_fragment_library.py",
|
||||||
|
"--input",
|
||||||
|
str(input_path),
|
||||||
|
"--db",
|
||||||
|
str(db_path),
|
||||||
|
"--output-dir",
|
||||||
|
str(output_dir),
|
||||||
|
"--ring-size",
|
||||||
|
"16",
|
||||||
|
)
|
||||||
|
|
||||||
|
assert completed.returncode == 0, completed.stderr
|
||||||
|
assert (output_dir / "fragment_atom_count_distribution.png").exists()
|
||||||
|
assert (output_dir / "fragment_atom_count_summary.csv").exists()
|
||||||
|
assert (output_dir / "fragment_atom_count_filter_candidates.csv").exists()
|
||||||
|
assert (output_dir / "ring16_position_count_comparison.csv").exists()
|
||||||
|
assert (output_dir / "ring16_position_count_comparison.png").exists()
|
||||||
|
assert (output_dir / "ring16_position_atom_count_distribution.png").exists()
|
||||||
|
assert (output_dir / "ring16_position_atom_count_boxplot_gt3.png").exists()
|
||||||
|
assert (output_dir / "ring16_position_diversity.csv").exists()
|
||||||
|
assert (output_dir / "ring16_position_diversity_gt3.csv").exists()
|
||||||
|
assert (output_dir / "ring16_position_diversity_gt3.png").exists()
|
||||||
|
assert (output_dir / "ring16_position_ring_sensitivity.csv").exists()
|
||||||
|
assert (output_dir / "ring16_position_ring_sensitivity.png").exists()
|
||||||
|
assert (output_dir / "ring16_medchem_hotspot_comparison.csv").exists()
|
||||||
|
assert (output_dir / "ring16_medchem_hotspot_comparison.png").exists()
|
||||||
|
assert (output_dir / "fragment_library_analysis_report.md").exists()
|
||||||
|
assert (output_dir / "fragment_library_analysis_report_zh.md").exists()
|
||||||
|
assert (output_dir / "analysis_summary.txt").exists()
|
||||||
|
|
||||||
|
diversity = pd.read_csv(output_dir / "ring16_position_diversity.csv")
|
||||||
|
assert set(diversity["cleavage_position"]) == {3, 4}
|
||||||
|
assert set(diversity["total_fragments"]) == {1, 2}
|
||||||
|
|
||||||
|
diversity_gt3 = pd.read_csv(output_dir / "ring16_position_diversity_gt3.csv")
|
||||||
|
assert diversity_gt3.empty
|
||||||
|
|
||||||
|
report_zh = (output_dir / "fragment_library_analysis_report_zh.md").read_text(encoding="utf-8")
|
||||||
|
assert "桥连或双锚点侧链不会进入当前片段库" in report_zh
|
||||||
|
assert "cyclic single-anchor side chains" in report_zh
|
||||||
|
|
||||||
|
|
||||||
def test_active_text_assets_do_not_reference_legacy_api():
|
def test_active_text_assets_do_not_reference_legacy_api():
|
||||||
forbidden_patterns = [
|
forbidden_patterns = [
|
||||||
"from src.",
|
"from src.",
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from macro_lactone_toolkit import MacrolactoneFragmenter
|
|||||||
from macro_lactone_toolkit.splicing.engine import splice_molecule
|
from macro_lactone_toolkit.splicing.engine import splice_molecule
|
||||||
from macro_lactone_toolkit.splicing.scaffold_prep import prepare_macrolactone_scaffold
|
from macro_lactone_toolkit.splicing.scaffold_prep import prepare_macrolactone_scaffold
|
||||||
|
|
||||||
from .helpers import build_macrolactone, canonicalize
|
from .helpers import build_macrolactone, build_macrolactone_with_fused_side_ring, canonicalize
|
||||||
|
|
||||||
|
|
||||||
def test_splice_benzene_methyl():
|
def test_splice_benzene_methyl():
|
||||||
@@ -49,3 +49,14 @@ def test_prepare_scaffold_and_reassemble_fragment():
|
|||||||
product = splice_molecule(scaffold, Chem.MolFromSmiles(fragment.fragment_smiles_labeled), position=5)
|
product = splice_molecule(scaffold, Chem.MolFromSmiles(fragment.fragment_smiles_labeled), position=5)
|
||||||
|
|
||||||
assert canonicalize(product) == canonicalize(built.mol)
|
assert canonicalize(product) == canonicalize(built.mol)
|
||||||
|
|
||||||
|
|
||||||
|
def test_prepare_scaffold_rejects_position_without_single_anchor_fragment():
|
||||||
|
built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
|
||||||
|
|
||||||
|
with pytest.raises(ValueError, match="Position 5 does not contain a single-anchor fragmentable side chain"):
|
||||||
|
prepare_macrolactone_scaffold(
|
||||||
|
built.smiles,
|
||||||
|
positions=[5],
|
||||||
|
ring_size=16,
|
||||||
|
)
|
||||||
|
|||||||
40
tests/validation/test_fragment_library_analysis.py
Normal file
40
tests/validation/test_fragment_library_analysis.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from macro_lactone_toolkit.validation.fragment_library_analysis import (
|
||||||
|
build_position_diversity_table,
|
||||||
|
count_non_dummy_atoms,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_count_non_dummy_atoms_excludes_dummy_atoms() -> None:
|
||||||
|
assert count_non_dummy_atoms("*O") == 1
|
||||||
|
assert count_non_dummy_atoms("*C") == 1
|
||||||
|
assert count_non_dummy_atoms("*C(C)C") == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_position_diversity_table_combines_frequency_and_structure_metrics() -> None:
|
||||||
|
dataframe = pd.DataFrame(
|
||||||
|
[
|
||||||
|
{"cleavage_position": 3, "fragment_smiles_plain": "*C"},
|
||||||
|
{"cleavage_position": 3, "fragment_smiles_plain": "*CC"},
|
||||||
|
{"cleavage_position": 3, "fragment_smiles_plain": "*CC"},
|
||||||
|
{"cleavage_position": 3, "fragment_smiles_plain": "*O"},
|
||||||
|
{"cleavage_position": 4, "fragment_smiles_plain": "*C"},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
summary = build_position_diversity_table(dataframe).set_index("cleavage_position")
|
||||||
|
|
||||||
|
assert summary.loc[3, "total_fragments"] == 4
|
||||||
|
assert summary.loc[3, "unique_fragments"] == 3
|
||||||
|
assert summary.loc[3, "normalized_unique_ratio"] == pytest.approx(0.75)
|
||||||
|
assert summary.loc[3, "shannon_entropy"] > 0.0
|
||||||
|
assert summary.loc[3, "normalized_shannon_entropy"] > 0.0
|
||||||
|
assert summary.loc[3, "mean_pairwise_tanimoto_distance"] > 0.0
|
||||||
|
|
||||||
|
assert summary.loc[4, "total_fragments"] == 1
|
||||||
|
assert summary.loc[4, "unique_fragments"] == 1
|
||||||
|
assert summary.loc[4, "mean_pairwise_tanimoto_distance"] == 0.0
|
||||||
57
tests/validation/test_validator.py
Normal file
57
tests/validation/test_validator.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from macro_lactone_toolkit.validation.validator import MacrolactoneValidator
|
||||||
|
|
||||||
|
from ..helpers import build_macrolactone_with_fused_side_ring
|
||||||
|
|
||||||
|
|
||||||
|
def test_validator_exports_only_single_anchor_fragments_and_fragment_library(tmp_path):
|
||||||
|
built = build_macrolactone_with_fused_side_ring(side_chains={10: "methyl"})
|
||||||
|
input_path = tmp_path / "input.csv"
|
||||||
|
output_dir = tmp_path / "validation_output"
|
||||||
|
|
||||||
|
pd.DataFrame(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"ml_id": "ML00000001",
|
||||||
|
"IDs": "CHEMBL0001",
|
||||||
|
"smiles": built.smiles,
|
||||||
|
}
|
||||||
|
]
|
||||||
|
).to_csv(input_path, index=False)
|
||||||
|
|
||||||
|
validator = MacrolactoneValidator(output_dir=output_dir, sample_ratio=1.0)
|
||||||
|
results = validator.run(input_path)
|
||||||
|
|
||||||
|
assert results == {"total": 1, "success": 1, "failed": 0, "skipped": 0}
|
||||||
|
|
||||||
|
with sqlite3.connect(output_dir / "fragments.db") as connection:
|
||||||
|
fragments = connection.execute(
|
||||||
|
"SELECT cleavage_position, has_dummy_atom, dummy_atom_count FROM side_chain_fragments"
|
||||||
|
).fetchall()
|
||||||
|
library_entries = connection.execute(
|
||||||
|
"""
|
||||||
|
SELECT source_type, source_parent_ml_id, source_parent_chembl_id,
|
||||||
|
cleavage_position, has_dummy_atom, dummy_atom_count, splice_ready
|
||||||
|
FROM fragment_library_entries
|
||||||
|
"""
|
||||||
|
).fetchall()
|
||||||
|
|
||||||
|
assert fragments == [(10, 1, 1)]
|
||||||
|
assert library_entries == [("validation_extract", "ML00000001", "CHEMBL0001", 10, 1, 1, 1)]
|
||||||
|
|
||||||
|
summary = pd.read_csv(output_dir / "summary.csv")
|
||||||
|
assert summary.loc[0, "num_sidechains"] == 1
|
||||||
|
assert json.loads(summary.loc[0, "cleavage_positions"]) == [10]
|
||||||
|
|
||||||
|
fragment_library = pd.read_csv(output_dir / "fragment_library.csv")
|
||||||
|
assert fragment_library.loc[0, "source_type"] == "validation_extract"
|
||||||
|
assert int(fragment_library.loc[0, "cleavage_position"]) == 10
|
||||||
|
assert bool(fragment_library.loc[0, "has_dummy_atom"]) is True
|
||||||
|
assert int(fragment_library.loc[0, "dummy_atom_count"]) == 1
|
||||||
|
assert bool(fragment_library.loc[0, "splice_ready"]) is True
|
||||||
97
validation_output/README.md
Normal file
97
validation_output/README.md
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
# MacrolactoneDB Validation Output
|
||||||
|
|
||||||
|
This directory contains validation results for MacrolactoneDB 12-20 membered rings.
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
validation_output/
|
||||||
|
├── README.md # This file
|
||||||
|
├── fragments.db # SQLite database with all data
|
||||||
|
├── fragment_library.csv # Unified fragment library export
|
||||||
|
├── summary.csv # Summary of all processed molecules
|
||||||
|
├── summary_statistics.json # Statistical summary
|
||||||
|
│
|
||||||
|
├── ring_size_12/ # 12-membered rings
|
||||||
|
├── ring_size_13/ # 13-membered rings
|
||||||
|
...
|
||||||
|
└── ring_size_20/ # 20-membered rings
|
||||||
|
├── molecules.csv # Molecules in this ring size
|
||||||
|
├── standard/ # Standard macrolactones
|
||||||
|
│ ├── numbered/ # Numbered ring images
|
||||||
|
│ │ └── {id}_numbered.png
|
||||||
|
│ └── sidechains/ # Fragment images
|
||||||
|
│ └── {id}/
|
||||||
|
│ └── {id}_frag_{n}_pos{pos}.png
|
||||||
|
├── non_standard/ # Non-standard macrocycles
|
||||||
|
│ └── original/
|
||||||
|
│ └── {id}_original.png
|
||||||
|
└── rejected/ # Not macrolactones
|
||||||
|
└── original/
|
||||||
|
└── {id}_original.png
|
||||||
|
```
|
||||||
|
|
||||||
|
## Database Schema
|
||||||
|
|
||||||
|
### Tables
|
||||||
|
|
||||||
|
- **parent_molecules**: Original molecule information
|
||||||
|
- **ring_numberings**: Ring atom numbering details
|
||||||
|
- **side_chain_fragments**: Fragmentation results with isotope tags
|
||||||
|
- **fragment_library_entries**: Unified fragment library rows for downstream design
|
||||||
|
- **validation_results**: Manual validation records
|
||||||
|
|
||||||
|
### Key Fields
|
||||||
|
|
||||||
|
- `classification`: standard_macrolactone | non_standard_macrocycle | not_macrolactone
|
||||||
|
- `dummy_isotope`: Cleavage position stored as isotope value for reconstruction
|
||||||
|
- `cleavage_position`: Position on ring where side chain was attached
|
||||||
|
- `has_dummy_atom`: Whether the fragment contains a dummy atom for splicing
|
||||||
|
- `dummy_atom_count`: Number of dummy atoms in the fragment
|
||||||
|
|
||||||
|
## Ring Numbering Convention
|
||||||
|
|
||||||
|
1. Position 1 = Lactone carbonyl carbon (C=O)
|
||||||
|
2. Position 2 = Ester oxygen (-O-)
|
||||||
|
3. Positions 3-N = Sequential around ring
|
||||||
|
|
||||||
|
## Isotope Tagging
|
||||||
|
|
||||||
|
Fragments use isotope values to mark cleavage position:
|
||||||
|
- `[5*]CCO` = Fragment from position 5, dummy atom has isotope=5
|
||||||
|
- This enables precise reconstruction during reassembly
|
||||||
|
|
||||||
|
## CSV Columns
|
||||||
|
|
||||||
|
### summary.csv
|
||||||
|
|
||||||
|
- `ml_id`: MacrolactoneDB unique ID (e.g., ML00000001)
|
||||||
|
- `chembl_id`: Original CHEMBL ID (if available)
|
||||||
|
- `classification`: Classification result
|
||||||
|
- `ring_size`: Detected ring size (12-20)
|
||||||
|
- `num_sidechains`: Number of side chains detected
|
||||||
|
- `cleavage_positions`: JSON array of cleavage positions
|
||||||
|
- `processing_status`: pending | success | failed | skipped
|
||||||
|
|
||||||
|
### fragment_library.csv
|
||||||
|
|
||||||
|
- `source_type`: validation_extract | supplemental (reserved)
|
||||||
|
- `has_dummy_atom`: Whether the fragment contains a dummy atom
|
||||||
|
- `dummy_atom_count`: Number of dummy atoms
|
||||||
|
- `splice_ready`: Whether the fragment is directly compatible with single-anchor splicing
|
||||||
|
|
||||||
|
## Querying the Database
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# List tables
|
||||||
|
sqlite3 fragments.db ".tables"
|
||||||
|
|
||||||
|
# Get standard macrolactones with fragments
|
||||||
|
sqlite3 fragments.db "SELECT * FROM parent_molecules WHERE classification='standard_macrolactone' LIMIT 5;"
|
||||||
|
|
||||||
|
# Get fragments for a specific molecule
|
||||||
|
sqlite3 fragments.db "SELECT * FROM side_chain_fragments WHERE parent_id=1;"
|
||||||
|
|
||||||
|
# Count by ring size
|
||||||
|
sqlite3 fragments.db "SELECT ring_size, COUNT(*) FROM parent_molecules GROUP BY ring_size;"
|
||||||
|
```
|
||||||
BIN
validation_output/analysis_plots.tar.gz
Normal file
BIN
validation_output/analysis_plots.tar.gz
Normal file
Binary file not shown.
34830
validation_output/fragment_library.csv
Normal file
34830
validation_output/fragment_library.csv
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,47 @@
|
|||||||
|
Analyzed rows: 34829
|
||||||
|
Unique parent molecules: 4451
|
||||||
|
Unique fragment smiles: 1852
|
||||||
|
Fragment atom count percentiles: p05=1.0, p25=1.0, p50=1.0, p75=2.0, p95=14.0
|
||||||
|
|
||||||
|
Filter candidates (drop fragments with atom_count <= threshold):
|
||||||
|
<= 1: remove 23994 rows (68.9%), remove 10 unique fragments (0.5%)
|
||||||
|
<= 2: remove 28069 rows (80.6%), remove 26 unique fragments (1.4%)
|
||||||
|
<= 3: remove 28550 rows (82.0%), remove 52 unique fragments (2.8%)
|
||||||
|
<= 4: remove 29045 rows (83.4%), remove 88 unique fragments (4.8%)
|
||||||
|
<= 5: remove 29272 rows (84.0%), remove 141 unique fragments (7.6%)
|
||||||
|
|
||||||
|
Ring 16 rows: 8108
|
||||||
|
Ring 16 unique fragment smiles: 596
|
||||||
|
Ring 16 rows with >= 4 heavy atoms: 1880
|
||||||
|
Ring 16 unique fragment smiles with >= 4 heavy atoms: 566
|
||||||
|
|
||||||
|
Ring 16 top positions by normalized Shannon entropy:
|
||||||
|
Position 7: entropy=0.857, unique=4, mean_atom_count=2.57
|
||||||
|
Position 13: entropy=0.739, unique=198, mean_atom_count=15.50
|
||||||
|
Position 4: entropy=0.584, unique=70, mean_atom_count=6.89
|
||||||
|
Position 12: entropy=0.490, unique=99, mean_atom_count=3.63
|
||||||
|
Position 3: entropy=0.449, unique=121, mean_atom_count=5.10
|
||||||
|
|
||||||
|
Ring 16 top positions by mean pairwise Tanimoto distance:
|
||||||
|
Position 16: distance=0.901, entropy=0.415, atom_count_range=12
|
||||||
|
Position 10: distance=0.871, entropy=0.077, atom_count_range=13
|
||||||
|
Position 7: distance=0.860, entropy=0.857, atom_count_range=9
|
||||||
|
Position 14: distance=0.848, entropy=0.375, atom_count_range=13
|
||||||
|
Position 12: distance=0.839, entropy=0.490, atom_count_range=20
|
||||||
|
|
||||||
|
Ring 16 top filtered positions by normalized Shannon entropy:
|
||||||
|
Position 6: entropy=0.973, unique=60, total=89, mean_atom_count=12.58
|
||||||
|
Position 12: entropy=0.886, unique=83, total=177, mean_atom_count=10.00
|
||||||
|
Position 3: entropy=0.854, unique=117, total=269, mean_atom_count=15.41
|
||||||
|
Position 13: entropy=0.763, unique=193, total=709, mean_atom_count=18.91
|
||||||
|
Position 9: entropy=0.729, unique=37, total=141, mean_atom_count=7.82
|
||||||
|
|
||||||
|
Medicinal-chemistry hotspot comparison:
|
||||||
|
Position 6: all=536, >=4 atoms=89, unique_filtered=60, entropy_filtered=0.973
|
||||||
|
Position 7: all=23, >=4 atoms=4, unique_filtered=1, entropy_filtered=0.000
|
||||||
|
Position 15: all=747, >=4 atoms=205, unique_filtered=8, entropy_filtered=0.456
|
||||||
|
Position 16: all=135, >=4 atoms=5, unique_filtered=5, entropy_filtered=1.000
|
||||||
|
|
||||||
|
Interpretation note: atom-count spread is only a coarse proxy for diversity.
|
||||||
|
Use entropy and fingerprint distance as primary diversity evidence; use atom-count spread as supporting context.
|
||||||
|
For cyclic-side-chain sensitivity, see ring_sensitivity output and the markdown report.
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
drop_if_atom_count_lte,removed_rows,removed_row_fraction,retained_rows,retained_row_fraction,removed_unique_fragments,removed_unique_fraction,retained_unique_fragments,retained_unique_fraction
|
||||||
|
1,23994,0.6889086680639697,10835,0.31109133193603034,10,0.005399568034557235,1842,0.9946004319654428
|
||||||
|
2,28069,0.80590886904591,6760,0.19409113095408997,26,0.014038876889848811,1826,0.9859611231101512
|
||||||
|
3,28550,0.8197191995176434,6279,0.18028080048235665,52,0.028077753779697623,1800,0.9719222462203023
|
||||||
|
4,29045,0.8339314938700508,5784,0.16606850612994917,88,0.047516198704103674,1764,0.9524838012958964
|
||||||
|
5,29272,0.8404490510781245,5557,0.15955094892187544,141,0.07613390928725702,1711,0.923866090712743
|
||||||
|
6,29353,0.8427746992448821,5476,0.15722530075511787,188,0.10151187904967603,1664,0.8984881209503239
|
||||||
|
7,29446,0.845444887880789,5383,0.154555112119211,248,0.13390928725701945,1604,0.8660907127429806
|
||||||
|
8,29603,0.849952625685492,5226,0.15004737431450801,331,0.1787257019438445,1521,0.8212742980561555
|
||||||
|
9,29812,0.8559533721898418,5017,0.1440466278101582,411,0.22192224622030238,1441,0.7780777537796977
|
||||||
|
10,30146,0.8655430819144966,4683,0.13445691808550345,510,0.275377969762419,1342,0.724622030237581
|
||||||
|
@@ -0,0 +1,46 @@
|
|||||||
|
fragment_atom_count,row_count,unique_fragment_count,row_fraction,unique_fragment_fraction
|
||||||
|
1,23994,10,0.6889086680639697,0.005399568034557235
|
||||||
|
2,4075,16,0.11700020098194033,0.008639308855291577
|
||||||
|
3,481,26,0.013810330471733325,0.014038876889848811
|
||||||
|
4,495,36,0.014212294352407477,0.019438444924406047
|
||||||
|
5,227,53,0.006517557208073732,0.028617710583153346
|
||||||
|
6,81,47,0.002325648166757587,0.025377969762419007
|
||||||
|
7,93,60,0.0026701886359068593,0.032397408207343416
|
||||||
|
8,157,83,0.004507737804702977,0.044816414686825054
|
||||||
|
9,209,80,0.006000746504349824,0.04319654427645788
|
||||||
|
10,334,99,0.009589709724654743,0.05345572354211663
|
||||||
|
11,375,142,0.010766889660914755,0.07667386609071274
|
||||||
|
12,2002,148,0.057480834936403574,0.07991360691144708
|
||||||
|
13,382,100,0.01096787160125183,0.05399568034557235
|
||||||
|
14,504,133,0.014470699704269431,0.07181425485961124
|
||||||
|
15,230,97,0.006603692325361049,0.052375809935205186
|
||||||
|
16,110,79,0.003158287633868328,0.04265658747300216
|
||||||
|
17,122,71,0.0035028281030176005,0.03833693304535637
|
||||||
|
18,128,81,0.003675098337592236,0.04373650107991361
|
||||||
|
19,109,53,0.003129575928105889,0.028617710583153346
|
||||||
|
20,30,26,0.0008613511728731804,0.014038876889848811
|
||||||
|
21,56,48,0.0016078555226966035,0.02591792656587473
|
||||||
|
22,24,21,0.0006890809382985443,0.011339092872570195
|
||||||
|
23,137,42,0.0039335036894541904,0.02267818574514039
|
||||||
|
24,32,29,0.000918774584398059,0.01565874730021598
|
||||||
|
25,26,21,0.000746504349823423,0.011339092872570195
|
||||||
|
26,50,36,0.0014355852881219673,0.019438444924406047
|
||||||
|
27,69,29,0.001981107697608315,0.01565874730021598
|
||||||
|
28,41,23,0.0011771799362600133,0.012419006479481642
|
||||||
|
29,84,34,0.002411783284044905,0.0183585313174946
|
||||||
|
30,30,21,0.0008613511728731804,0.011339092872570195
|
||||||
|
31,18,13,0.0005168107037239083,0.007019438444924406
|
||||||
|
32,33,23,0.0009474862901604985,0.012419006479481642
|
||||||
|
33,14,10,0.0004019638806741509,0.005399568034557235
|
||||||
|
34,11,9,0.0003158287633868328,0.004859611231101512
|
||||||
|
35,14,10,0.0004019638806741509,0.005399568034557235
|
||||||
|
36,8,7,0.00022969364609951476,0.003779697624190065
|
||||||
|
37,8,8,0.00022969364609951476,0.004319654427645789
|
||||||
|
38,8,7,0.00022969364609951476,0.003779697624190065
|
||||||
|
39,14,9,0.0004019638806741509,0.004859611231101512
|
||||||
|
40,2,2,5.742341152487869e-05,0.0010799136069114472
|
||||||
|
41,2,2,5.742341152487869e-05,0.0010799136069114472
|
||||||
|
43,2,2,5.742341152487869e-05,0.0010799136069114472
|
||||||
|
44,2,2,5.742341152487869e-05,0.0010799136069114472
|
||||||
|
45,3,3,8.613511728731804e-05,0.0016198704103671706
|
||||||
|
48,3,1,8.613511728731804e-05,0.0005399568034557236
|
||||||
|
@@ -0,0 +1,2 @@
|
|||||||
|
rows,unique_parent_molecules,unique_fragment_smiles,min_atom_count,p05_atom_count,p25_atom_count,median_atom_count,mean_atom_count,p75_atom_count,p95_atom_count,max_atom_count
|
||||||
|
34829,4451,1852,1,1.0,1.0,1.0,3.3206523299549224,2.0,14.0,48
|
||||||
|
@@ -0,0 +1,151 @@
|
|||||||
|
# Fragment Library Analysis Report
|
||||||
|
|
||||||
|
## Scope and Dataset
|
||||||
|
|
||||||
|
- Input fragment library: `validation_extract` rows merged with `parent_molecules` metadata.
|
||||||
|
- Current validated design library contains **34,829** splice-ready fragment rows from **4,451** parent macrolactones.
|
||||||
|
- The 16-membered subset contains **8,108** fragment rows from **1,105** parent molecules.
|
||||||
|
- This dataset is narrower than the earlier broad workflow summary because it only keeps **splice-ready, single-anchor fragments** from the validated library. It should not be compared to prior total-fragment counts as if they were the same denominator.
|
||||||
|
|
||||||
|
## Figure 1. Global Atom-Count Distribution
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
- Heavy-atom count is extremely right-skewed: p05=1.0, p25=1.0, median=1.0, p75=2.0, p95=14.0.
|
||||||
|
- A conservative cleanup filter of `<= 2` heavy atoms removes **28,069** rows (80.6%) but only **26** unique fragment SMILES (1.4%).
|
||||||
|
- A design-oriented filter aligned with your previous analysis, `<= 3` heavy atoms, removes **28,550** rows (82.0%) and **52** unique fragment SMILES (2.8%).
|
||||||
|
- Interpretation: `<= 2` is the safer default for library cleanup, while `> 3` is useful for positional diversity analysis because it suppresses one- and two-atom noise.
|
||||||
|
|
||||||
|
## Figure 2. Ring-Specific Counts Before and After Size Filtering
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
- This figure compares all splice-ready fragments with the design-oriented subset that keeps fragments with **>= 4 heavy atoms**.
|
||||||
|
- After size filtering, the most populated 16-membered positions are:
|
||||||
|
|
||||||
|
```text
|
||||||
|
cleavage_position total_fragments_gt3 gt3_row_fraction
|
||||||
|
13 709 0.809361
|
||||||
|
3 269 0.256679
|
||||||
|
4 269 0.452101
|
||||||
|
15 205 0.274431
|
||||||
|
12 177 0.190323
|
||||||
|
9 141 0.192098
|
||||||
|
```
|
||||||
|
|
||||||
|
## Figure 3. Atom-Count Spread for Design-Relevant Fragments
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
- This boxplot only includes fragments with **>= 4 heavy atoms**.
|
||||||
|
- Positions 13, 3, 4, 11 and 6 carry the broadest large-fragment size envelopes; position 15 remains common but its size spread is narrower, indicating repeated reuse of a small set of acyl-like substituents.
|
||||||
|
|
||||||
|
## Figure 4. Position-Wise Diversity Metrics After Filtering
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
- Diversity is evaluated with three complementary metrics:
|
||||||
|
1. `unique_fragments`: raw chemotype count.
|
||||||
|
2. `normalized_shannon_entropy`: how evenly those chemotypes are distributed.
|
||||||
|
3. `mean_pairwise_tanimoto_distance`: structural spread in Morgan fingerprint space.
|
||||||
|
|
||||||
|
Top robust positions after removing <=3-heavy-atom fragments (`total_fragments >= 20`):
|
||||||
|
|
||||||
|
```text
|
||||||
|
cleavage_position total_fragments unique_fragments normalized_shannon_entropy mean_pairwise_tanimoto_distance mean_atom_count
|
||||||
|
6 89 60 0.973126 0.780092 12.584270
|
||||||
|
12 177 83 0.885717 0.825769 10.000000
|
||||||
|
3 269 117 0.854464 0.764966 15.405204
|
||||||
|
13 709 193 0.763147 0.565162 18.906911
|
||||||
|
9 141 37 0.729256 0.804709 7.815603
|
||||||
|
4 269 63 0.585202 0.701620 13.375465
|
||||||
|
15 205 8 0.455556 0.769222 4.692683
|
||||||
|
```
|
||||||
|
|
||||||
|
- Within the filtered 16-membered set, **position 6** is the strongest support for your medicinal-chemistry hypothesis: it has moderate abundance (**89** rows) but high chemotype diversity and near-maximal entropy.
|
||||||
|
- **Position 15** is also clearly relevant because it retains **205** filtered rows, but its diversity is narrow; the site is frequent, yet dominated by a few acyl motifs.
|
||||||
|
|
||||||
|
## Figure 5. Focus on Medicinal-Chemistry Hotspots
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
This panel focuses on positions 6, 7, 15 and 16 because these are the literature-guided derivatization positions from tylosin / tilmicosin / tildipirosin / tulathromycin-like scaffold analysis.
|
||||||
|
|
||||||
|
```text
|
||||||
|
cleavage_position total_fragments_all total_fragments_gt3 unique_fragments_gt3 normalized_shannon_entropy_gt3 mean_pairwise_tanimoto_distance_gt3
|
||||||
|
6 536 89 60 0.973126 0.780092
|
||||||
|
7 23 4 1 0.000000 0.000000
|
||||||
|
15 747 205 8 0.455556 0.769222
|
||||||
|
16 135 5 5 1.000000 0.891010
|
||||||
|
```
|
||||||
|
|
||||||
|
- Position 6: supported by the current database as a **design-relevant and structurally diverse** site.
|
||||||
|
- Position 7: not supported as a prevalent natural hotspot in the current database; it appears only a few times after filtering, so it should be described as a **literature- or scaffold-guided modification site**, not a database-enriched site.
|
||||||
|
- Position 15: supported as a **frequent modification site**, but the retained chemotypes are concentrated into a small number of acyl substituents.
|
||||||
|
- Position 16: not prevalent in the current database, but the few retained fragments are structurally distinct singletons; this makes it a **low-evidence exploratory site**, not a high-confidence natural hotspot.
|
||||||
|
|
||||||
|
## Figure 6. Are the Top Positions Driven by Ring-Bearing Side Chains?
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
- Multi-anchor bridge or fused components do **not** enter the fragment library because the collector only keeps side-chain components with exactly one ring connection; see [src/macro_lactone_toolkit/_core.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/_core.py#L293) and [src/macro_lactone_toolkit/validation/validator.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/validation/validator.py#L206).
|
||||||
|
- What can still happen is that a retained fragment is a **single-anchor but ring-bearing side chain** such as a sugar, heterocycle or other cyclic appendage.
|
||||||
|
|
||||||
|
```text
|
||||||
|
cleavage_position total_fragments_gt3 cyclic_fragment_rows acyclic_fragment_rows acyclic_row_fraction cyclic_unique_fragments acyclic_unique_fragments
|
||||||
|
3 269 231 38 0.141264 94 23
|
||||||
|
4 269 263 6 0.022305 57 6
|
||||||
|
5 0 0 0 0.000000 0 0
|
||||||
|
6 89 82 7 0.078652 54 6
|
||||||
|
7 4 0 4 1.000000 0 1
|
||||||
|
8 0 0 0 0.000000 0 0
|
||||||
|
9 141 61 80 0.567376 29 8
|
||||||
|
10 4 4 0 0.000000 4 0
|
||||||
|
11 7 0 7 1.000000 0 5
|
||||||
|
12 177 116 61 0.344633 38 45
|
||||||
|
13 709 689 20 0.028209 187 6
|
||||||
|
14 1 1 0 0.000000 1 0
|
||||||
|
15 205 4 201 0.980488 3 5
|
||||||
|
16 5 4 1 0.200000 4 1
|
||||||
|
```
|
||||||
|
|
||||||
|
- This shows that positions 13, 4, 3 and 6 are indeed dominated by **ring-bearing single-anchor side chains**, not by leaked bridge fragments.
|
||||||
|
- Position 15 behaves differently: almost all retained `>3` fragments are **acyclic** acyl-like substituents.
|
||||||
|
- Therefore, if your scientific question is specifically about non-cyclic side-chain diversification, the ranking should be recalculated on an acyclic-only subset.
|
||||||
|
|
||||||
|
Acyclic-only ranking after removing <=3-heavy-atom fragments:
|
||||||
|
|
||||||
|
```text
|
||||||
|
cleavage_position total_fragments unique_fragments normalized_shannon_entropy mean_pairwise_tanimoto_distance mean_atom_count
|
||||||
|
15 201 5 0.526528 0.633544 4.562189
|
||||||
|
9 80 8 0.527036 0.607697 4.387500
|
||||||
|
12 61 45 0.976171 0.792308 8.409836
|
||||||
|
3 38 23 0.949162 0.677847 12.236842
|
||||||
|
13 20 6 0.900676 0.749984 7.500000
|
||||||
|
6 7 6 0.975504 0.591713 7.571429
|
||||||
|
11 7 5 0.962961 0.695245 14.857143
|
||||||
|
4 6 6 1.000000 0.730513 5.500000
|
||||||
|
```
|
||||||
|
|
||||||
|
- Under this stricter acyclic-only view, position 15 becomes the dominant site, followed by 9, 12 and 3. Positions 13 and 4 no longer dominate, confirming that their earlier prominence is mainly driven by cyclic side chains.
|
||||||
|
|
||||||
|
## Reconciliation With the Previous Conclusion
|
||||||
|
|
||||||
|
The earlier statement that `6,7,15,16` are important 16-membered macrolide modification positions is **not invalidated** by the current analysis, but it must be phrased carefully because it answers a different question.
|
||||||
|
|
||||||
|
- The **previous conclusion** came from scaffold-centric medicinal chemistry on known 16-membered macrolides and identifies **synthetically exploited modification positions**.
|
||||||
|
- The **current MacrolactoneDB analysis** measures **natural side-chain occurrence and diversity** in a validated fragment library.
|
||||||
|
- These are related but not identical concepts. A site can be medicinally attractive even if it is rare in natural products, and a site can be naturally diverse without being the preferred position for semisynthetic optimization.
|
||||||
|
|
||||||
|
### Recommended paper-safe wording
|
||||||
|
|
||||||
|
> In the validated MacrolactoneDB fragment library, natural side-chain diversity of 16-membered macrolactones is concentrated primarily at positions 13, 3/4 and 12. After excluding fragments with <=3 heavy atoms to focus on design-relevant substituents, position 6 remains strongly diversity-enriched and position 15 remains frequency-enriched, whereas positions 7 and 16 are sparse and should be interpreted as literature-guided derivatization sites rather than statistically dominant natural hotspots.
|
||||||
|
|
||||||
|
### Practical interpretation for fragment-library design
|
||||||
|
|
||||||
|
- Use `<= 2` heavy atoms as the **default cleanup filter** for the production fragment library.
|
||||||
|
- Use the stricter `> 3` heavy-atom subset when discussing **position-wise diversity** or aligning with the previous exploratory analysis.
|
||||||
|
- For 16-membered macrolide design, prioritize positions **13, 3, 4, 12 and 6** for natural-diversity-driven fragment mining.
|
||||||
|
- Keep positions **15** as a targeted acyl-modification site even though its chemotype diversity is narrower.
|
||||||
|
- Treat positions **7 and 16** as hypothesis-driven medicinal chemistry positions that need literature or synthesis justification beyond database prevalence.
|
||||||
|
|
||||||
@@ -0,0 +1,103 @@
|
|||||||
|
# 大环内酯碎片库分析报告(中文)
|
||||||
|
|
||||||
|
## 数据范围
|
||||||
|
|
||||||
|
- 当前验证后的可拼接碎片库包含 **34,829** 条片段记录,来源于 **4,451** 个母体分子。
|
||||||
|
- 其中 16 元环子集包含 **8,108** 条片段记录,来源于 **1,105** 个母体分子。
|
||||||
|
- 用于设计相关位点分析的严格子集定义为:片段重原子数 **>= 4**。
|
||||||
|
|
||||||
|
## 全库碎片大小结论
|
||||||
|
|
||||||
|
- 默认清洗阈值建议使用 `<= 2` 重原子删除。该阈值会删除 **28,069** 条记录(80.6%),但仅删除 **26** 个唯一片段(1.4%)。
|
||||||
|
- 若用于和你之前的分析口径对齐,则建议采用 `> 3` 重原子作为设计相关片段集合。此时会删除 **28,550** 条记录(82.0%)。
|
||||||
|
|
||||||
|
## 16 元环位点结论
|
||||||
|
|
||||||
|
- 在 16 元环中,保留所有可拼接片段时,天然侧链多样性较高的位置主要集中在 `13、3/4、12`,并且 6 位也显示出较强的设计相关多样性。
|
||||||
|
- 在 `> 3` 重原子子集中,样本量足够且多样性较高的位点为:
|
||||||
|
|
||||||
|
```text
|
||||||
|
cleavage_position total_fragments unique_fragments normalized_shannon_entropy mean_pairwise_tanimoto_distance mean_atom_count
|
||||||
|
6 89 60 0.973126 0.780092 12.584270
|
||||||
|
12 177 83 0.885717 0.825769 10.000000
|
||||||
|
3 269 117 0.854464 0.764966 15.405204
|
||||||
|
13 709 193 0.763147 0.565162 18.906911
|
||||||
|
9 141 37 0.729256 0.804709 7.815603
|
||||||
|
4 269 63 0.585202 0.701620 13.375465
|
||||||
|
15 205 8 0.455556 0.769222 4.692683
|
||||||
|
```
|
||||||
|
|
||||||
|
- 其中 6 位最能支持你的药化判断:它不是最高频位点,但在设计相关大侧链中显示出很高的结构多样性。
|
||||||
|
- 15 位则更偏向高频但低多样性的酰基修饰位点。
|
||||||
|
|
||||||
|
## 桥环 / 稠环干扰的敏感性分析
|
||||||
|
|
||||||
|
桥连或双锚点侧链不会进入当前片段库,因为断裂逻辑只保留与主环存在 **1 个连接点** 的侧链组件。也就是说,真正的 bridge / fused multi-anchor components 已被代码层面排除。
|
||||||
|
|
||||||
|
但是,需要额外区分另一类情况:**cyclic single-anchor side chains**。这类片段虽然只在一个位置连到主环,因此会被保留下来,但片段自身可能包含糖环、杂环或其他环状骨架,仍然会显著影响位点多样性排名。
|
||||||
|
|
||||||
|
敏感性分析结果如下:
|
||||||
|
|
||||||
|
```text
|
||||||
|
cleavage_position total_fragments_gt3 cyclic_fragment_rows acyclic_fragment_rows acyclic_row_fraction cyclic_unique_fragments acyclic_unique_fragments
|
||||||
|
3 269 231 38 0.141264 94 23
|
||||||
|
4 269 263 6 0.022305 57 6
|
||||||
|
5 0 0 0 0.000000 0 0
|
||||||
|
6 89 82 7 0.078652 54 6
|
||||||
|
7 4 0 4 1.000000 0 1
|
||||||
|
8 0 0 0 0.000000 0 0
|
||||||
|
9 141 61 80 0.567376 29 8
|
||||||
|
10 4 4 0 0.000000 4 0
|
||||||
|
11 7 0 7 1.000000 0 5
|
||||||
|
12 177 116 61 0.344633 38 45
|
||||||
|
13 709 689 20 0.028209 187 6
|
||||||
|
14 1 1 0 0.000000 1 0
|
||||||
|
15 205 4 201 0.980488 3 5
|
||||||
|
16 5 4 1 0.200000 4 1
|
||||||
|
```
|
||||||
|
|
||||||
|
- `13` 位:`689/709` 条大侧链片段本身带环,说明该位点的高多样性主要由天然糖基/环状侧链驱动。
|
||||||
|
- `4` 位:`263/269` 条大侧链片段带环,几乎同样完全由环状侧链主导。
|
||||||
|
- `3` 位:`231/269` 条大侧链片段带环,带环片段占主导。
|
||||||
|
- `12` 位:环状与非环状侧链并存,但环状侧链仍占多数。
|
||||||
|
- `6` 位:多数保留的大侧链也带环,但仍表现出较强的多样性。
|
||||||
|
- `15` 位:几乎全部是非环状侧链,因此更接近半合成药化修饰位点的特征。
|
||||||
|
|
||||||
|
## 仅看非环状侧链时的位点排序
|
||||||
|
|
||||||
|
- 当只保留 `> 3` 重原子且 **不含环** 的侧链时,位点排序明显变化:
|
||||||
|
|
||||||
|
```text
|
||||||
|
cleavage_position total_fragments unique_fragments normalized_shannon_entropy mean_pairwise_tanimoto_distance mean_atom_count
|
||||||
|
15 201 5 0.526528 0.633544 4.562189
|
||||||
|
9 80 8 0.527036 0.607697 4.387500
|
||||||
|
12 61 45 0.976171 0.792308 8.409836
|
||||||
|
3 38 23 0.949162 0.677847 12.236842
|
||||||
|
13 20 6 0.900676 0.749984 7.500000
|
||||||
|
6 7 6 0.975504 0.591713 7.571429
|
||||||
|
11 7 5 0.962961 0.695245 14.857143
|
||||||
|
4 6 6 1.000000 0.730513 5.500000
|
||||||
|
```
|
||||||
|
|
||||||
|
- 在这个更严格的口径下,`15` 位成为最主要的非环状侧链位点,随后是 `9、12、3` 位。
|
||||||
|
- 因此,`13、3、4、12` 的高天然多样性是真实存在的,但它主要表征的是**带环单锚点天然侧链**的富集,而不是桥连片段泄漏。
|
||||||
|
|
||||||
|
## 论文可直接引用的中文讨论段落
|
||||||
|
|
||||||
|
在当前验证后的 MacrolactoneDB 片段库中,桥连或双锚点侧链不会进入当前片段库,因此 16 元大环内酯位点排序的变化并不是由桥环片段误纳入所致。本研究的断裂规则仅保留与主环具有单一连接点的可拼接侧链;然而,许多被保留的大侧链本身仍可包含糖环、杂环或稠合环等 cyclic single-anchor side chains(带环单锚点侧链),这会显著抬高 13、3、4、12 以及 6 位的天然侧链多样性统计。相反,当仅保留 >3 个重原子且进一步限定为非环状侧链后,位点排序转而更偏向 15、9、12 和 3 位,说明 13、3、4、12 的优势主要反映了天然带环侧链的富集,而 15 位则更接近药物化学中常见的非环状酰基修饰位点。因此,在论文讨论中应明确区分“天然带环侧链多样性热点”和“非环状半合成修饰热点”这两类位点概念。
|
||||||
|
|
||||||
|
## 建议的论文表述方式
|
||||||
|
|
||||||
|
- 若讨论天然产物中的侧链多样性,可写为:`16 元大环内酯的天然侧链多样性主要集中在 13、3/4 和 12 位,并在 6 位保留较强的设计相关多样性。`
|
||||||
|
- 若讨论药化半合成改造热点,可写为:`6、7、15、16 位代表文献和先导化合物研究中优先使用的衍生化位点,其中 6 和 15 位在数据库统计中分别对应高多样性和高频率信号,而 7 和 16 位更多体现为文献指导的探索性位点。`
|
||||||
|
- 若专门讨论非环状侧链设计,则应强调:`在排除 <=3 重原子小片段并进一步排除带环侧链后,15 位是最主要的非环状侧链修饰位点。`
|
||||||
|
|
||||||
|
## 相关图表
|
||||||
|
|
||||||
|
- 全库原子数分布:`fragment_atom_count_distribution.png`
|
||||||
|
- 16 元环过滤前后位点数量对比:`ring16_position_count_comparison.png`
|
||||||
|
- 16 元环大侧链 boxplot:`ring16_position_atom_count_boxplot_gt3.png`
|
||||||
|
- 16 元环位点多样性图:`ring16_position_diversity_gt3.png`
|
||||||
|
- 16 元环桥环/带环侧链敏感性图:`ring16_position_ring_sensitivity.png`
|
||||||
|
- 16 元环药化热点对比图:`ring16_medchem_hotspot_comparison.png`
|
||||||
|
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
cleavage_position,total_fragments_all,unique_fragments_all,mean_atom_count_all,total_fragments_gt3,unique_fragments_gt3,mean_atom_count_gt3,gt3_row_fraction,gt3_unique_fraction,normalized_shannon_entropy_all,mean_pairwise_tanimoto_distance_all,normalized_shannon_entropy_gt3,mean_pairwise_tanimoto_distance_gt3
|
||||||
|
6,536,68,2.9402985074626864,89,60,12.584269662921349,0.166044776119403,0.8823529411764706,0.30950388972503856,0.8133239394372517,0.9731262721536741,0.7800916405422387
|
||||||
|
7,23,4,2.5652173913043477,4,1,10.0,0.17391304347826086,0.25,0.8567985762221871,0.86,0.0,0.0
|
||||||
|
15,747,13,2.0174029451137887,205,8,4.692682926829268,0.2744310575635877,0.6153846153846154,0.3487511597393803,0.8168500968741627,0.45555578681139725,0.7692219308030589
|
||||||
|
16,135,8,1.9851851851851852,5,5,8.4,0.037037037037037035,0.625,0.4148320683071004,0.9006253315127155,1.0000000000000002,0.8910101403362273
|
||||||
|
@@ -0,0 +1,15 @@
|
|||||||
|
cleavage_position,total_fragments_all,unique_fragments_all,mean_atom_count_all,total_fragments_gt3,unique_fragments_gt3,mean_atom_count_gt3,gt3_row_fraction,gt3_unique_fraction
|
||||||
|
3,1048,121,5.097328244274809,269,117,15.405204460966543,0.2566793893129771,0.9669421487603306
|
||||||
|
4,595,70,6.894117647058824,269,63,13.37546468401487,0.45210084033613446,0.9
|
||||||
|
5,54,2,1.0,0,0,0.0,0.0,0.0
|
||||||
|
6,536,68,2.9402985074626864,89,60,12.584269662921349,0.166044776119403,0.8823529411764706
|
||||||
|
7,23,4,2.5652173913043477,4,1,10.0,0.17391304347826086,0.25
|
||||||
|
8,123,4,1.016260162601626,0,0,0.0,0.0,0.0
|
||||||
|
9,734,41,2.310626702997275,141,37,7.815602836879433,0.19209809264305178,0.9024390243902439
|
||||||
|
10,993,7,1.0433031218529707,4,4,11.5,0.004028197381671702,0.5714285714285714
|
||||||
|
11,249,8,1.3895582329317269,7,5,14.857142857142858,0.028112449799196786,0.625
|
||||||
|
12,930,99,3.6268817204301076,177,83,10.0,0.19032258064516128,0.8383838383838383
|
||||||
|
13,876,198,15.501141552511415,709,193,18.90691114245416,0.8093607305936074,0.9747474747474747
|
||||||
|
14,1065,6,1.267605633802817,1,1,14.0,0.0009389671361502347,0.16666666666666666
|
||||||
|
15,747,13,2.0174029451137887,205,8,4.692682926829268,0.2744310575635877,0.6153846153846154
|
||||||
|
16,135,8,1.9851851851851852,5,5,8.4,0.037037037037037035,0.625
|
||||||
|
@@ -0,0 +1,15 @@
|
|||||||
|
cleavage_position,total_fragments,unique_fragments,normalized_unique_ratio,shannon_entropy,normalized_shannon_entropy,mean_pairwise_tanimoto_distance,mean_atom_count,median_atom_count,std_atom_count,iqr_atom_count,min_atom_count,max_atom_count,atom_count_range
|
||||||
|
3,1048,121,0.11545801526717557,2.1513093369119027,0.4485828387328402,0.7727580364590483,5.097328244274809,2.0,7.125515417214252,5.0,1,39,38
|
||||||
|
4,595,70,0.11764705882352941,2.4817848918166048,0.584156213064148,0.7382343170047687,6.894117647058824,2.0,6.017946467329047,12.5,1,18,17
|
||||||
|
5,54,2,0.037037037037037035,0.0922160573371918,0.13303964861069892,0.8,1.0,1.0,0.0,0.0,1,1,0
|
||||||
|
6,536,68,0.12686567164179105,1.3059540474767763,0.30950388972503856,0.8133239394372517,2.9402985074626864,1.0,4.444116006032291,0.0,1,20,19
|
||||||
|
7,23,4,0.17391304347826086,1.1877750348323688,0.8567985762221871,0.86,2.5652173913043477,1.0,3.4113122166840055,0.0,1,10,9
|
||||||
|
8,123,4,0.032520325203252036,0.2372288446747181,0.171124438884017,0.7915343915343915,1.016260162601626,1.0,0.1795993661331262,0.0,1,3,2
|
||||||
|
9,734,41,0.055858310626702996,1.472202321733401,0.39643833357458974,0.8284402664223979,2.310626702997275,1.0,3.280936828670794,0.0,1,17,16
|
||||||
|
10,993,7,0.007049345417925478,0.14975384152906807,0.07695825092529042,0.8712928851639762,1.0433031218529707,1.0,0.6755237166842796,0.0,1,14,13
|
||||||
|
11,249,8,0.0321285140562249,0.41116030451725305,0.1977263107791457,0.8041654723607728,1.3895582329317269,1.0,3.5978646341022595,0.0,1,41,40
|
||||||
|
12,930,99,0.1064516129032258,2.251679166978565,0.49001532939616665,0.8394325303461457,3.6268817204301076,3.0,3.4290862065303043,2.0,1,21,20
|
||||||
|
13,876,198,0.22602739726027396,3.9101740989981857,0.7394055701617327,0.5841396079985519,15.501141552511415,13.0,9.597433474135945,11.0,1,36,35
|
||||||
|
14,1065,6,0.005633802816901409,0.6717715558650733,0.3749228439431623,0.848458035826457,1.267605633802817,1.0,0.5852108438838486,1.0,1,14,13
|
||||||
|
15,747,13,0.01740294511378849,0.8945290630874894,0.3487511597393803,0.8168500968741627,2.0174029451137887,1.0,1.7641278118940684,3.0,1,13,12
|
||||||
|
16,135,8,0.05925925925925926,0.8626190356587518,0.4148320683071004,0.9006253315127155,1.9851851851851852,2.0,1.4141359627921224,0.5,1,13,12
|
||||||
|
@@ -0,0 +1,13 @@
|
|||||||
|
cleavage_position,total_fragments,unique_fragments,normalized_unique_ratio,shannon_entropy,normalized_shannon_entropy,mean_pairwise_tanimoto_distance,mean_atom_count,median_atom_count,std_atom_count,iqr_atom_count,min_atom_count,max_atom_count,atom_count_range
|
||||||
|
3,269,117,0.4349442379182156,4.069106586040983,0.8544640833690577,0.7649661414788775,15.405204460966543,14.0,7.356263980699357,10.0,4,39,35
|
||||||
|
4,269,63,0.2342007434944238,2.424572263644206,0.5852023706107886,0.7016204220824205,13.37546468401487,14.0,1.7682432704870303,0.0,5,18,13
|
||||||
|
6,89,60,0.6741573033707865,3.984314260747859,0.9731262721536741,0.7800916405422387,12.584269662921349,13.0,2.7016806619569276,3.0,5,20,15
|
||||||
|
7,4,1,0.25,-0.0,0.0,0.0,10.0,10.0,0.0,0.0,10,10,0
|
||||||
|
9,141,37,0.2624113475177305,2.633283400210265,0.7292559576027439,0.8047085590557639,7.815602836879433,5.0,4.30339275172448,7.0,4,17,13
|
||||||
|
10,4,4,1.0,1.3862943611198906,1.0,0.7794154494906375,11.5,11.5,1.8027756377319946,2.0,9,14,5
|
||||||
|
11,7,5,0.7142857142857143,1.5498260458782016,0.9629610647945144,0.6952446898083474,14.857142857142858,4.0,16.54801301346713,19.5,4,41,37
|
||||||
|
12,177,83,0.4689265536723164,3.913840989343892,0.8857167154747141,0.8257691691480503,10.0,10.0,2.788191402941166,4.0,4,21,17
|
||||||
|
13,709,193,0.27221438645980256,4.016208118657202,0.7631473589542491,0.5651618154111845,18.90691114245416,15.0,7.276898560461122,13.0,4,36,32
|
||||||
|
14,1,1,1.0,-0.0,0.0,0.0,14.0,14.0,0.0,0.0,14,14,0
|
||||||
|
15,205,8,0.03902439024390244,0.9473016276482624,0.45555578681139725,0.7692219308030589,4.692682926829268,5.0,1.2049471689465932,1.0,4,13,9
|
||||||
|
16,5,5,1.0,1.6094379124341005,1.0000000000000002,0.8910101403362273,8.4,7.0,2.4979991993593593,2.0,6,13,7
|
||||||
|
@@ -0,0 +1,15 @@
|
|||||||
|
cleavage_position,total_fragments_gt3,cyclic_fragment_rows,acyclic_fragment_rows,cyclic_row_fraction,acyclic_row_fraction,unique_fragments_gt3,cyclic_unique_fragments,acyclic_unique_fragments
|
||||||
|
3,269,231,38,0.8587360594795539,0.1412639405204461,117,94,23
|
||||||
|
4,269,263,6,0.9776951672862454,0.022304832713754646,63,57,6
|
||||||
|
5,0,0,0,0.0,0.0,0,0,0
|
||||||
|
6,89,82,7,0.9213483146067416,0.07865168539325842,60,54,6
|
||||||
|
7,4,0,4,0.0,1.0,1,0,1
|
||||||
|
8,0,0,0,0.0,0.0,0,0,0
|
||||||
|
9,141,61,80,0.4326241134751773,0.5673758865248227,37,29,8
|
||||||
|
10,4,4,0,1.0,0.0,4,4,0
|
||||||
|
11,7,0,7,0.0,1.0,5,0,5
|
||||||
|
12,177,116,61,0.655367231638418,0.3446327683615819,83,38,45
|
||||||
|
13,709,689,20,0.9717912552891397,0.028208744710860368,193,187,6
|
||||||
|
14,1,1,0,1.0,0.0,1,1,0
|
||||||
|
15,205,4,201,0.01951219512195122,0.9804878048780488,8,3,5
|
||||||
|
16,5,4,1,0.8,0.2,5,4,1
|
||||||
|
6280
validation_output/fragment_library_filter_gt3.csv
Normal file
6280
validation_output/fragment_library_filter_gt3.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
validation_output/fragments.db
Normal file
BIN
validation_output/fragments.db
Normal file
Binary file not shown.
11037
validation_output/summary.csv
Normal file
11037
validation_output/summary.csv
Normal file
File diff suppressed because it is too large
Load Diff
23
validation_output/summary_statistics.json
Normal file
23
validation_output/summary_statistics.json
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
{
|
||||||
|
"total_molecules": 11036,
|
||||||
|
"by_classification": {
|
||||||
|
"non_standard_macrocycle": 6336,
|
||||||
|
"standard_macrolactone": 4482,
|
||||||
|
"not_macrolactone": 218
|
||||||
|
},
|
||||||
|
"by_ring_size": {
|
||||||
|
"14.0": 3017,
|
||||||
|
"16.0": 1879,
|
||||||
|
"15.0": 1613,
|
||||||
|
"12.0": 1419,
|
||||||
|
"19.0": 855,
|
||||||
|
"18.0": 809,
|
||||||
|
"13.0": 679,
|
||||||
|
"20.0": 243,
|
||||||
|
"17.0": 196
|
||||||
|
},
|
||||||
|
"by_status": {
|
||||||
|
"skipped": 6554,
|
||||||
|
"success": 4482
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user