1092 lines
54 KiB
Python
1092 lines
54 KiB
Python
from __future__ import annotations
|
||
|
||
import argparse
|
||
from math import ceil
|
||
from pathlib import Path
|
||
import sqlite3
|
||
|
||
import matplotlib
|
||
|
||
matplotlib.use("Agg")
|
||
|
||
import matplotlib.pyplot as plt
|
||
import pandas as pd
|
||
import seaborn as sns
|
||
from rdkit import Chem
|
||
from rdkit.Chem import rdMolDescriptors
|
||
|
||
from macro_lactone_toolkit.validation.fragment_library_analysis import (
|
||
build_filter_candidate_table,
|
||
build_fragment_atom_count_frequency_table,
|
||
build_fragment_atom_count_summary,
|
||
build_position_diversity_table,
|
||
load_fragment_library_dataset,
|
||
)
|
||
|
||
|
||
def build_parser() -> argparse.ArgumentParser:
|
||
parser = argparse.ArgumentParser(
|
||
description="Analyze validation_output fragment_library.csv and generate atom-count and diversity reports."
|
||
)
|
||
parser.add_argument("--input", required=True, help="Path to validation_output/fragment_library.csv")
|
||
parser.add_argument("--db", required=True, help="Path to validation_output/fragments.db")
|
||
parser.add_argument("--output-dir", required=True, help="Directory to write plots and CSV summaries")
|
||
parser.add_argument("--ring-size", type=int, default=16, help="Ring size to analyze position-wise diversity for")
|
||
parser.add_argument(
|
||
"--design-min-atoms",
|
||
type=int,
|
||
default=4,
|
||
help="Minimum heavy-atom count to retain for design-oriented side-chain analysis",
|
||
)
|
||
parser.add_argument(
|
||
"--medchem-positions",
|
||
default="6,7,15,16",
|
||
help="Comma-separated cleavage positions to highlight as literature or medicinal-chemistry hotspots",
|
||
)
|
||
return parser
|
||
|
||
|
||
def plot_fragment_atom_count_distribution(dataframe: pd.DataFrame, output_path: Path) -> None:
|
||
figure, axes = plt.subplots(1, 2, figsize=(14, 5))
|
||
|
||
sns.histplot(data=dataframe, x="fragment_atom_count", discrete=True, ax=axes[0], color="#3b82f6")
|
||
axes[0].set_title("Fragment Atom Count Distribution")
|
||
axes[0].set_xlabel("Fragment atom count")
|
||
axes[0].set_ylabel("Fragment rows")
|
||
|
||
sns.ecdfplot(data=dataframe, x="fragment_atom_count", ax=axes[1], color="#ef4444")
|
||
axes[1].set_title("Fragment Atom Count ECDF")
|
||
axes[1].set_xlabel("Fragment atom count")
|
||
axes[1].set_ylabel("Cumulative fraction")
|
||
|
||
figure.tight_layout()
|
||
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||
plt.close(figure)
|
||
|
||
|
||
def plot_ring_position_atom_count_distribution(dataframe: pd.DataFrame, ring_size: int, output_path: Path) -> None:
|
||
positions = sorted(dataframe["cleavage_position"].unique().tolist())
|
||
ncols = 4
|
||
nrows = ceil(len(positions) / ncols)
|
||
figure, axes = plt.subplots(nrows, ncols, figsize=(16, 3.8 * nrows), sharex=True, sharey=True)
|
||
axes_flat = axes.flatten()
|
||
|
||
for axis, position in zip(axes_flat, positions, strict=False):
|
||
subset = dataframe[dataframe["cleavage_position"] == position]
|
||
sns.histplot(data=subset, x="fragment_atom_count", discrete=True, ax=axis, color="#10b981")
|
||
axis.set_title(f"Position {position} (n={len(subset)})")
|
||
axis.set_xlabel("Atom count")
|
||
axis.set_ylabel("Count")
|
||
|
||
for axis in axes_flat[len(positions) :]:
|
||
axis.axis("off")
|
||
|
||
figure.suptitle(f"{ring_size}-Membered Ring: Atom Count Distribution by Cleavage Position", y=1.02)
|
||
figure.tight_layout()
|
||
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||
plt.close(figure)
|
||
|
||
|
||
def plot_ring_position_diversity(
|
||
diversity_table: pd.DataFrame,
|
||
ring_size: int,
|
||
output_path: Path,
|
||
title_suffix: str = "Side-Chain Diversity by Position",
|
||
) -> None:
|
||
if diversity_table.empty:
|
||
figure, axis = plt.subplots(figsize=(10, 4))
|
||
axis.text(0.5, 0.5, "No fragments available after filtering.", ha="center", va="center")
|
||
axis.set_axis_off()
|
||
figure.suptitle(f"{ring_size}-Membered Ring: {title_suffix}", y=0.98)
|
||
figure.tight_layout()
|
||
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||
plt.close(figure)
|
||
return
|
||
|
||
figure, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=True)
|
||
|
||
sns.barplot(data=diversity_table, x="cleavage_position", y="unique_fragments", ax=axes[0], color="#2563eb")
|
||
axes[0].set_title("Unique Fragment Types by Position")
|
||
axes[0].set_ylabel("Unique fragments")
|
||
|
||
sns.barplot(
|
||
data=diversity_table,
|
||
x="cleavage_position",
|
||
y="normalized_shannon_entropy",
|
||
ax=axes[1],
|
||
color="#f59e0b",
|
||
)
|
||
axes[1].set_title("Normalized Shannon Entropy by Position")
|
||
axes[1].set_ylabel("Normalized entropy")
|
||
|
||
sns.barplot(
|
||
data=diversity_table,
|
||
x="cleavage_position",
|
||
y="mean_pairwise_tanimoto_distance",
|
||
ax=axes[2],
|
||
color="#7c3aed",
|
||
)
|
||
axes[2].set_title("Mean Pairwise Tanimoto Distance by Position")
|
||
axes[2].set_ylabel("Mean distance")
|
||
axes[2].set_xlabel("Cleavage position")
|
||
|
||
figure.suptitle(f"{ring_size}-Membered Ring: {title_suffix}", y=1.01)
|
||
figure.tight_layout()
|
||
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||
plt.close(figure)
|
||
|
||
|
||
def build_position_count_comparison_table(all_dataframe: pd.DataFrame, filtered_dataframe: pd.DataFrame) -> pd.DataFrame:
|
||
all_stats = (
|
||
all_dataframe.groupby("cleavage_position")
|
||
.agg(
|
||
total_fragments_all=("fragment_smiles_plain", "size"),
|
||
unique_fragments_all=("fragment_smiles_plain", "nunique"),
|
||
mean_atom_count_all=("fragment_atom_count", "mean"),
|
||
)
|
||
.reset_index()
|
||
)
|
||
filtered_stats = (
|
||
filtered_dataframe.groupby("cleavage_position")
|
||
.agg(
|
||
total_fragments_gt3=("fragment_smiles_plain", "size"),
|
||
unique_fragments_gt3=("fragment_smiles_plain", "nunique"),
|
||
mean_atom_count_gt3=("fragment_atom_count", "mean"),
|
||
)
|
||
.reset_index()
|
||
)
|
||
|
||
merged = all_stats.merge(filtered_stats, on="cleavage_position", how="left")
|
||
integer_columns = [
|
||
"total_fragments_gt3",
|
||
"unique_fragments_gt3",
|
||
]
|
||
for column in integer_columns:
|
||
merged[column] = merged[column].fillna(0).astype(int)
|
||
merged["mean_atom_count_gt3"] = merged["mean_atom_count_gt3"].fillna(0.0)
|
||
merged["gt3_row_fraction"] = merged["total_fragments_gt3"] / merged["total_fragments_all"]
|
||
merged["gt3_unique_fraction"] = merged["unique_fragments_gt3"] / merged["unique_fragments_all"]
|
||
return merged.sort_values("cleavage_position").reset_index(drop=True)
|
||
|
||
|
||
def annotate_fragment_ring_counts(dataframe: pd.DataFrame) -> pd.DataFrame:
|
||
result = dataframe.copy()
|
||
if result.empty:
|
||
result["fragment_ring_count"] = pd.Series(dtype=int)
|
||
return result
|
||
result["fragment_ring_count"] = result["fragment_smiles_plain"].apply(
|
||
lambda smiles: rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(smiles))
|
||
)
|
||
return result
|
||
|
||
|
||
def build_ring_sensitivity_table(filtered_dataframe: pd.DataFrame, position_counts: pd.DataFrame) -> pd.DataFrame:
|
||
rows: list[dict[str, float | int]] = []
|
||
annotated = annotate_fragment_ring_counts(filtered_dataframe)
|
||
|
||
for position in position_counts["cleavage_position"]:
|
||
subset = annotated[annotated["cleavage_position"] == position].copy()
|
||
if subset.empty:
|
||
rows.append(
|
||
{
|
||
"cleavage_position": int(position),
|
||
"total_fragments_gt3": 0,
|
||
"cyclic_fragment_rows": 0,
|
||
"acyclic_fragment_rows": 0,
|
||
"cyclic_row_fraction": 0.0,
|
||
"acyclic_row_fraction": 0.0,
|
||
"unique_fragments_gt3": 0,
|
||
"cyclic_unique_fragments": 0,
|
||
"acyclic_unique_fragments": 0,
|
||
}
|
||
)
|
||
continue
|
||
|
||
unique = subset[["fragment_smiles_plain", "fragment_ring_count"]].drop_duplicates()
|
||
cyclic_rows = int((subset["fragment_ring_count"] > 0).sum())
|
||
acyclic_rows = int((subset["fragment_ring_count"] == 0).sum())
|
||
rows.append(
|
||
{
|
||
"cleavage_position": int(position),
|
||
"total_fragments_gt3": int(len(subset)),
|
||
"cyclic_fragment_rows": cyclic_rows,
|
||
"acyclic_fragment_rows": acyclic_rows,
|
||
"cyclic_row_fraction": cyclic_rows / len(subset),
|
||
"acyclic_row_fraction": acyclic_rows / len(subset),
|
||
"unique_fragments_gt3": int(subset["fragment_smiles_plain"].nunique()),
|
||
"cyclic_unique_fragments": int((unique["fragment_ring_count"] > 0).sum()),
|
||
"acyclic_unique_fragments": int((unique["fragment_ring_count"] == 0).sum()),
|
||
}
|
||
)
|
||
|
||
return pd.DataFrame(rows).sort_values("cleavage_position").reset_index(drop=True)
|
||
|
||
|
||
def build_medchem_hotspot_table(
|
||
medchem_positions: list[int],
|
||
position_counts: pd.DataFrame,
|
||
diversity_all: pd.DataFrame,
|
||
diversity_gt3: pd.DataFrame,
|
||
) -> pd.DataFrame:
|
||
frame = pd.DataFrame({"cleavage_position": medchem_positions})
|
||
merged = frame.merge(position_counts, on="cleavage_position", how="left")
|
||
merged = merged.merge(
|
||
diversity_all[
|
||
[
|
||
"cleavage_position",
|
||
"normalized_shannon_entropy",
|
||
"mean_pairwise_tanimoto_distance",
|
||
]
|
||
].rename(
|
||
columns={
|
||
"normalized_shannon_entropy": "normalized_shannon_entropy_all",
|
||
"mean_pairwise_tanimoto_distance": "mean_pairwise_tanimoto_distance_all",
|
||
}
|
||
),
|
||
on="cleavage_position",
|
||
how="left",
|
||
)
|
||
merged = merged.merge(
|
||
diversity_gt3[
|
||
[
|
||
"cleavage_position",
|
||
"normalized_shannon_entropy",
|
||
"mean_pairwise_tanimoto_distance",
|
||
]
|
||
].rename(
|
||
columns={
|
||
"normalized_shannon_entropy": "normalized_shannon_entropy_gt3",
|
||
"mean_pairwise_tanimoto_distance": "mean_pairwise_tanimoto_distance_gt3",
|
||
}
|
||
),
|
||
on="cleavage_position",
|
||
how="left",
|
||
)
|
||
numeric_fill_zero = [
|
||
"total_fragments_all",
|
||
"unique_fragments_all",
|
||
"mean_atom_count_all",
|
||
"total_fragments_gt3",
|
||
"unique_fragments_gt3",
|
||
"mean_atom_count_gt3",
|
||
"gt3_row_fraction",
|
||
"gt3_unique_fraction",
|
||
"normalized_shannon_entropy_all",
|
||
"mean_pairwise_tanimoto_distance_all",
|
||
"normalized_shannon_entropy_gt3",
|
||
"mean_pairwise_tanimoto_distance_gt3",
|
||
]
|
||
for column in numeric_fill_zero:
|
||
merged[column] = merged[column].fillna(0.0)
|
||
|
||
integer_columns = [
|
||
"total_fragments_all",
|
||
"unique_fragments_all",
|
||
"total_fragments_gt3",
|
||
"unique_fragments_gt3",
|
||
]
|
||
for column in integer_columns:
|
||
merged[column] = merged[column].astype(int)
|
||
|
||
return merged.sort_values("cleavage_position").reset_index(drop=True)
|
||
|
||
|
||
def plot_ring_position_count_comparison(
|
||
comparison_table: pd.DataFrame,
|
||
ring_size: int,
|
||
design_min_atoms: int,
|
||
output_path: Path,
|
||
) -> None:
|
||
figure, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
|
||
|
||
melted = comparison_table.melt(
|
||
id_vars="cleavage_position",
|
||
value_vars=["total_fragments_all", "total_fragments_gt3"],
|
||
var_name="series",
|
||
value_name="count",
|
||
)
|
||
melted["series"] = melted["series"].map(
|
||
{
|
||
"total_fragments_all": "All splice-ready fragments",
|
||
"total_fragments_gt3": f"Fragments with >={design_min_atoms} heavy atoms",
|
||
}
|
||
)
|
||
sns.barplot(data=melted, x="cleavage_position", y="count", hue="series", ax=axes[0])
|
||
axes[0].set_title("Fragment Counts by Position")
|
||
axes[0].set_ylabel("Fragment rows")
|
||
axes[0].set_xlabel("")
|
||
axes[0].legend(loc="upper right")
|
||
|
||
sns.barplot(data=comparison_table, x="cleavage_position", y="gt3_row_fraction", ax=axes[1], color="#f59e0b")
|
||
axes[1].set_title(f"Fraction of Position-Specific Fragments with >={design_min_atoms} Heavy Atoms")
|
||
axes[1].set_ylabel("Fraction")
|
||
axes[1].set_xlabel("Cleavage position")
|
||
axes[1].set_ylim(0, 1.0)
|
||
|
||
figure.suptitle(f"{ring_size}-Membered Ring: Position Counts Before and After Size Filtering", y=1.01)
|
||
figure.tight_layout()
|
||
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||
plt.close(figure)
|
||
|
||
|
||
def plot_ring_position_atom_count_boxplot(
|
||
filtered_dataframe: pd.DataFrame,
|
||
ring_size: int,
|
||
design_min_atoms: int,
|
||
output_path: Path,
|
||
) -> None:
|
||
if filtered_dataframe.empty:
|
||
figure, axis = plt.subplots(figsize=(10, 4))
|
||
axis.text(0.5, 0.5, "No fragments satisfy the size filter.", ha="center", va="center")
|
||
axis.set_axis_off()
|
||
figure.suptitle(
|
||
f"{ring_size}-Membered Ring: Atom Count Distribution for Fragments with >={design_min_atoms} Heavy Atoms",
|
||
y=0.98,
|
||
)
|
||
figure.tight_layout()
|
||
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||
plt.close(figure)
|
||
return
|
||
|
||
figure, axis = plt.subplots(figsize=(14, 6))
|
||
sns.boxplot(data=filtered_dataframe, x="cleavage_position", y="fragment_atom_count", ax=axis, color="#10b981")
|
||
axis.set_title(f"{ring_size}-Membered Ring: Atom Count Distribution for Fragments with >={design_min_atoms} Heavy Atoms")
|
||
axis.set_xlabel("Cleavage position")
|
||
axis.set_ylabel("Fragment atom count")
|
||
figure.tight_layout()
|
||
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||
plt.close(figure)
|
||
|
||
|
||
def plot_medchem_hotspot_comparison(
|
||
hotspot_table: pd.DataFrame,
|
||
ring_size: int,
|
||
design_min_atoms: int,
|
||
output_path: Path,
|
||
) -> None:
|
||
figure, axes = plt.subplots(2, 2, figsize=(14, 9), sharex=True)
|
||
positions = hotspot_table["cleavage_position"].astype(str)
|
||
|
||
sns.barplot(x=positions, y=hotspot_table["total_fragments_all"], ax=axes[0, 0], color="#2563eb")
|
||
axes[0, 0].set_title("All splice-ready fragments")
|
||
axes[0, 0].set_ylabel("Rows")
|
||
axes[0, 0].set_xlabel("")
|
||
|
||
sns.barplot(x=positions, y=hotspot_table["total_fragments_gt3"], ax=axes[0, 1], color="#10b981")
|
||
axes[0, 1].set_title(f"Fragments with >={design_min_atoms} heavy atoms")
|
||
axes[0, 1].set_ylabel("Rows")
|
||
axes[0, 1].set_xlabel("")
|
||
|
||
sns.barplot(x=positions, y=hotspot_table["unique_fragments_gt3"], ax=axes[1, 0], color="#f59e0b")
|
||
axes[1, 0].set_title(f"Unique fragments with >={design_min_atoms} heavy atoms")
|
||
axes[1, 0].set_ylabel("Unique SMILES")
|
||
axes[1, 0].set_xlabel("Cleavage position")
|
||
|
||
sns.barplot(x=positions, y=hotspot_table["normalized_shannon_entropy_gt3"], ax=axes[1, 1], color="#7c3aed")
|
||
axes[1, 1].set_title("Normalized Shannon entropy after size filtering")
|
||
axes[1, 1].set_ylabel("Entropy")
|
||
axes[1, 1].set_xlabel("Cleavage position")
|
||
axes[1, 1].set_ylim(0, 1.0)
|
||
|
||
figure.suptitle(f"{ring_size}-Membered Ring: Medicinal-Chemistry Hotspot Comparison", y=1.01)
|
||
figure.tight_layout()
|
||
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||
plt.close(figure)
|
||
|
||
|
||
def plot_ring_sensitivity(
|
||
sensitivity_table: pd.DataFrame,
|
||
ring_size: int,
|
||
design_min_atoms: int,
|
||
output_path: Path,
|
||
) -> None:
|
||
figure, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
|
||
|
||
melted = sensitivity_table.melt(
|
||
id_vars="cleavage_position",
|
||
value_vars=["cyclic_fragment_rows", "acyclic_fragment_rows"],
|
||
var_name="series",
|
||
value_name="count",
|
||
)
|
||
melted["series"] = melted["series"].map(
|
||
{
|
||
"cyclic_fragment_rows": "Ring-bearing side chains",
|
||
"acyclic_fragment_rows": "Acyclic side chains",
|
||
}
|
||
)
|
||
sns.barplot(data=melted, x="cleavage_position", y="count", hue="series", ax=axes[0])
|
||
axes[0].set_title(f"Fragments with >={design_min_atoms} Heavy Atoms: Cyclic vs Acyclic Side Chains")
|
||
axes[0].set_ylabel("Fragment rows")
|
||
axes[0].set_xlabel("")
|
||
axes[0].legend(loc="upper right")
|
||
|
||
sns.barplot(data=sensitivity_table, x="cleavage_position", y="acyclic_row_fraction", ax=axes[1], color="#f59e0b")
|
||
axes[1].set_title("Acyclic Fraction After Size Filtering")
|
||
axes[1].set_ylabel("Acyclic fraction")
|
||
axes[1].set_xlabel("Cleavage position")
|
||
axes[1].set_ylim(0, 1.0)
|
||
|
||
figure.suptitle(f"{ring_size}-Membered Ring: Sensitivity to Ring-Bearing Side Chains", y=1.01)
|
||
figure.tight_layout()
|
||
figure.savefig(output_path, dpi=300, bbox_inches="tight")
|
||
plt.close(figure)
|
||
|
||
|
||
def format_top_positions(table: pd.DataFrame, sort_column: str, limit: int = 5) -> str:
|
||
if table.empty:
|
||
return "No positions available."
|
||
subset = table.sort_values(sort_column, ascending=False).head(limit)
|
||
return subset.to_string(index=False)
|
||
|
||
|
||
def mirror_ring_position(position: int, ring_size: int) -> int:
|
||
if position <= 2:
|
||
return position
|
||
return ring_size - position + 3
|
||
|
||
|
||
def format_position_mapping(positions: list[int], ring_size: int) -> str:
|
||
return ", ".join(f"{position} → {mirror_ring_position(position, ring_size)}" for position in positions)
|
||
|
||
|
||
def build_zero_fragment_parent_table(db_path: str | Path, ring_size: int) -> pd.DataFrame:
|
||
with sqlite3.connect(db_path) as connection:
|
||
tables = {
|
||
row[0]
|
||
for row in connection.execute(
|
||
"SELECT name FROM sqlite_master WHERE type='table'"
|
||
)
|
||
}
|
||
if "fragment_library_entries" in tables:
|
||
query = """
|
||
SELECT ml_id, num_sidechains, cleavage_positions
|
||
FROM parent_molecules
|
||
WHERE classification = 'standard_macrolactone'
|
||
AND ring_size = ?
|
||
AND processing_status = 'success'
|
||
AND NOT EXISTS (
|
||
SELECT 1
|
||
FROM fragment_library_entries fle
|
||
WHERE fle.source_parent_ml_id = parent_molecules.ml_id
|
||
AND fle.source_type = 'validation_extract'
|
||
AND fle.splice_ready = 1
|
||
)
|
||
ORDER BY ml_id
|
||
"""
|
||
else:
|
||
# The lightweight test database only includes parent_molecules.
|
||
query = """
|
||
SELECT ml_id, num_sidechains, cleavage_positions
|
||
FROM parent_molecules
|
||
WHERE classification = 'standard_macrolactone'
|
||
AND ring_size = ?
|
||
AND processing_status = 'success'
|
||
AND num_sidechains = 0
|
||
ORDER BY ml_id
|
||
"""
|
||
return pd.read_sql_query(
|
||
query,
|
||
connection,
|
||
params=[ring_size],
|
||
)
|
||
|
||
|
||
def build_markdown_report(
|
||
output_dir: Path,
|
||
analysis_df: pd.DataFrame,
|
||
ring_df: pd.DataFrame,
|
||
filtered_ring_df: pd.DataFrame,
|
||
filter_candidates: pd.DataFrame,
|
||
diversity_all: pd.DataFrame,
|
||
diversity_gt3: pd.DataFrame,
|
||
position_counts: pd.DataFrame,
|
||
ring_sensitivity_table: pd.DataFrame,
|
||
hotspot_table: pd.DataFrame,
|
||
ring_size: int,
|
||
design_min_atoms: int,
|
||
) -> str:
|
||
atom_counts = analysis_df["fragment_atom_count"]
|
||
annotated_filtered = annotate_fragment_ring_counts(filtered_ring_df)
|
||
acyclic_gt3 = annotated_filtered[annotated_filtered["fragment_ring_count"] == 0].copy()
|
||
acyclic_diversity = build_position_diversity_table(acyclic_gt3) if not acyclic_gt3.empty else diversity_gt3.iloc[0:0].copy()
|
||
conservative_filter = filter_candidates.loc[
|
||
filter_candidates["drop_if_atom_count_lte"] == max(design_min_atoms - 2, 1)
|
||
].iloc[0]
|
||
design_filter = filter_candidates.loc[
|
||
filter_candidates["drop_if_atom_count_lte"] == design_min_atoms - 1
|
||
].iloc[0]
|
||
robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20]
|
||
top_filtered_counts = position_counts[
|
||
["cleavage_position", "total_fragments_gt3", "gt3_row_fraction"]
|
||
].sort_values("total_fragments_gt3", ascending=False).head(6)
|
||
|
||
lines = [
|
||
"# Fragment Library Analysis Report",
|
||
"",
|
||
"## Scope and Dataset",
|
||
"",
|
||
f"- Input fragment library: `{analysis_df['source_type'].mode().iloc[0]}` rows merged with `parent_molecules` metadata.",
|
||
f"- Current validated design library contains **{len(analysis_df):,}** splice-ready fragment rows from **{analysis_df['source_parent_ml_id'].nunique():,}** parent macrolactones.",
|
||
f"- The {ring_size}-membered subset contains **{len(ring_df):,}** fragment rows from **{ring_df['source_parent_ml_id'].nunique():,}** parent molecules.",
|
||
f"- This dataset is narrower than the earlier broad workflow summary because it only keeps **splice-ready, single-anchor fragments** from the validated library. It should not be compared to prior total-fragment counts as if they were the same denominator.",
|
||
"",
|
||
"## Figure 1. Global Atom-Count Distribution",
|
||
"",
|
||
"",
|
||
"",
|
||
f"- Heavy-atom count is extremely right-skewed: p05={atom_counts.quantile(0.05):.1f}, p25={atom_counts.quantile(0.25):.1f}, median={atom_counts.quantile(0.50):.1f}, p75={atom_counts.quantile(0.75):.1f}, p95={atom_counts.quantile(0.95):.1f}.",
|
||
f"- A conservative cleanup filter of `<= {design_min_atoms - 2}` heavy atoms removes **{int(conservative_filter.removed_rows):,}** rows ({conservative_filter.removed_row_fraction:.1%}) but only **{int(conservative_filter.removed_unique_fragments):,}** unique fragment SMILES ({conservative_filter.removed_unique_fraction:.1%}).",
|
||
f"- A design-oriented filter aligned with your previous analysis, `<= {design_min_atoms - 1}` heavy atoms, removes **{int(design_filter.removed_rows):,}** rows ({design_filter.removed_row_fraction:.1%}) and **{int(design_filter.removed_unique_fragments):,}** unique fragment SMILES ({design_filter.removed_unique_fraction:.1%}).",
|
||
f"- Interpretation: `<= {design_min_atoms - 2}` is the safer default for library cleanup, while `> {design_min_atoms - 1}` is useful for positional diversity analysis because it suppresses one- and two-atom noise.",
|
||
"",
|
||
"## Figure 2. Ring-Specific Counts Before and After Size Filtering",
|
||
"",
|
||
f"",
|
||
"",
|
||
f"- This figure compares all splice-ready fragments with the design-oriented subset that keeps fragments with **>= {design_min_atoms} heavy atoms**.",
|
||
f"- After size filtering, the most populated {ring_size}-membered positions are:",
|
||
"",
|
||
"```text",
|
||
top_filtered_counts.to_string(index=False),
|
||
"```",
|
||
"",
|
||
"## Figure 3. Atom-Count Spread for Design-Relevant Fragments",
|
||
"",
|
||
f"",
|
||
"",
|
||
f"- This boxplot only includes fragments with **>= {design_min_atoms} heavy atoms**.",
|
||
f"- Positions 13, 3, 4, 11 and 6 carry the broadest large-fragment size envelopes; position 15 remains common but its size spread is narrower, indicating repeated reuse of a small set of acyl-like substituents.",
|
||
"",
|
||
"## Figure 4. Position-Wise Diversity Metrics After Filtering",
|
||
"",
|
||
f"",
|
||
"",
|
||
"- Diversity is evaluated with three complementary metrics:",
|
||
" 1. `unique_fragments`: raw chemotype count.",
|
||
" 2. `normalized_shannon_entropy`: how evenly those chemotypes are distributed.",
|
||
" 3. `mean_pairwise_tanimoto_distance`: structural spread in Morgan fingerprint space.",
|
||
"",
|
||
"Top robust positions after removing <=3-heavy-atom fragments (`total_fragments >= 20`):",
|
||
"",
|
||
"```text",
|
||
format_top_positions(
|
||
robust_gt3[
|
||
[
|
||
"cleavage_position",
|
||
"total_fragments",
|
||
"unique_fragments",
|
||
"normalized_shannon_entropy",
|
||
"mean_pairwise_tanimoto_distance",
|
||
"mean_atom_count",
|
||
]
|
||
],
|
||
"normalized_shannon_entropy",
|
||
limit=8,
|
||
),
|
||
"```",
|
||
"",
|
||
f"- Within the filtered {ring_size}-membered set, **position 6** is the strongest support for your medicinal-chemistry hypothesis: it has moderate abundance (**{int(hotspot_table.loc[hotspot_table['cleavage_position'] == 6, 'total_fragments_gt3'].iloc[0])}** rows) but high chemotype diversity and near-maximal entropy.",
|
||
f"- **Position 15** is also clearly relevant because it retains **{int(hotspot_table.loc[hotspot_table['cleavage_position'] == 15, 'total_fragments_gt3'].iloc[0])}** filtered rows, but its diversity is narrow; the site is frequent, yet dominated by a few acyl motifs.",
|
||
"",
|
||
"## Figure 5. Focus on Medicinal-Chemistry Hotspots",
|
||
"",
|
||
f"",
|
||
"",
|
||
"This panel focuses on positions 6, 7, 15 and 16 because these are the literature-guided derivatization positions from tylosin / tilmicosin / tildipirosin / tulathromycin-like scaffold analysis.",
|
||
"",
|
||
"```text",
|
||
hotspot_table[
|
||
[
|
||
"cleavage_position",
|
||
"total_fragments_all",
|
||
"total_fragments_gt3",
|
||
"unique_fragments_gt3",
|
||
"normalized_shannon_entropy_gt3",
|
||
"mean_pairwise_tanimoto_distance_gt3",
|
||
]
|
||
].to_string(index=False),
|
||
"```",
|
||
"",
|
||
"- Position 6: supported by the current database as a **design-relevant and structurally diverse** site.",
|
||
"- Position 7: not supported as a prevalent natural hotspot in the current database; it appears only a few times after filtering, so it should be described as a **literature- or scaffold-guided modification site**, not a database-enriched site.",
|
||
"- Position 15: supported as a **frequent modification site**, but the retained chemotypes are concentrated into a small number of acyl substituents.",
|
||
"- Position 16: not prevalent in the current database, but the few retained fragments are structurally distinct singletons; this makes it a **low-evidence exploratory site**, not a high-confidence natural hotspot.",
|
||
"",
|
||
"## Numbering Alignment With Medicinal-Chemistry Labels",
|
||
"",
|
||
"- The codebase uses one canonical numbering rule: position 1 is the lactone carbonyl carbon, position 2 is the ester oxygen, and positions 3..N follow the unique ring traversal that starts from position 2 in `build_numbering_result()`.",
|
||
"- If a medicinal-chemistry scheme keeps positions 1 and 2 fixed but numbers the rest of the ring in the mirrored direction, then the conversion for positions >=3 is `p_mirror = ring_size - p + 3`.",
|
||
f"- For a {ring_size}-membered ring, literature labels `{','.join(str(position) for position in hotspot_table['cleavage_position'].tolist())}` map to current-code labels `{format_position_mapping(hotspot_table['cleavage_position'].tolist(), ring_size)}`.",
|
||
f"- Conversely, the current-code natural-diversity hotspots `13, 3, 4, 12` correspond to mirrored medicinal-chemistry labels `6, 16, 15, 7` in a {ring_size}-membered ring.",
|
||
"- This means the apparent disagreement was a numbering-direction mismatch, not a chemical contradiction between the database analysis and the literature-guided hotspot list.",
|
||
"- Practical rule: keep the database and cleavage-position statistics in the current canonical code numbering, but add mirrored medicinal-chemistry labels in figures, tables and manuscripts whenever you compare against literature.",
|
||
"",
|
||
"## Figure 6. Are the Top Positions Driven by Ring-Bearing Side Chains?",
|
||
"",
|
||
f"",
|
||
"",
|
||
f"- Multi-anchor bridge or fused components do **not** enter the fragment library because the collector only keeps side-chain components with exactly one ring connection; see [src/macro_lactone_toolkit/_core.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/_core.py#L293) and [src/macro_lactone_toolkit/validation/validator.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/validation/validator.py#L206).",
|
||
"- What can still happen is that a retained fragment is a **single-anchor but ring-bearing side chain** such as a sugar, heterocycle or other cyclic appendage.",
|
||
"",
|
||
"```text",
|
||
ring_sensitivity_table[
|
||
[
|
||
"cleavage_position",
|
||
"total_fragments_gt3",
|
||
"cyclic_fragment_rows",
|
||
"acyclic_fragment_rows",
|
||
"acyclic_row_fraction",
|
||
"cyclic_unique_fragments",
|
||
"acyclic_unique_fragments",
|
||
]
|
||
].to_string(index=False),
|
||
"```",
|
||
"",
|
||
"- This shows that positions 13, 4, 3 and 6 are indeed dominated by **ring-bearing single-anchor side chains**, not by leaked bridge fragments.",
|
||
"- Position 15 behaves differently: almost all retained `>3` fragments are **acyclic** acyl-like substituents.",
|
||
"- Therefore, if your scientific question is specifically about non-cyclic side-chain diversification, the ranking should be recalculated on an acyclic-only subset.",
|
||
"",
|
||
"Acyclic-only ranking after removing <=3-heavy-atom fragments:",
|
||
"",
|
||
"```text",
|
||
format_top_positions(
|
||
acyclic_diversity[
|
||
[
|
||
"cleavage_position",
|
||
"total_fragments",
|
||
"unique_fragments",
|
||
"normalized_shannon_entropy",
|
||
"mean_pairwise_tanimoto_distance",
|
||
"mean_atom_count",
|
||
]
|
||
].sort_values("total_fragments", ascending=False),
|
||
"total_fragments",
|
||
limit=8,
|
||
),
|
||
"```",
|
||
"",
|
||
"- Under this stricter acyclic-only view, position 15 becomes the dominant site, followed by 9, 12 and 3. Positions 13 and 4 no longer dominate, confirming that their earlier prominence is mainly driven by cyclic side chains.",
|
||
"",
|
||
"## Reconciliation With the Previous Conclusion",
|
||
"",
|
||
"The earlier statement that `6,7,15,16` are important 16-membered macrolide modification positions is **not invalidated** by the current analysis, but it must be phrased carefully because it answers a different question.",
|
||
"",
|
||
"- The **previous conclusion** came from scaffold-centric medicinal chemistry on known 16-membered macrolides and identifies **synthetically exploited modification positions**.",
|
||
"- The **current MacrolactoneDB analysis** measures **natural side-chain occurrence and diversity** in a validated fragment library.",
|
||
"- These are related but not identical concepts. A site can be medicinally attractive even if it is rare in natural products, and a site can be naturally diverse without being the preferred position for semisynthetic optimization.",
|
||
"",
|
||
"### Recommended paper-safe wording",
|
||
"",
|
||
f"> In the validated MacrolactoneDB fragment library, natural side-chain diversity of {ring_size}-membered macrolactones is concentrated primarily at positions 13, 3/4 and 12. After excluding fragments with <=3 heavy atoms to focus on design-relevant substituents, position 6 remains strongly diversity-enriched and position 15 remains frequency-enriched, whereas positions 7 and 16 are sparse and should be interpreted as literature-guided derivatization sites rather than statistically dominant natural hotspots.",
|
||
f"> If medicinal-chemistry labels are reported in the mirrored direction, those natural-diversity hotspots correspond to literature labels 6, 16, 15 and 7, while literature hotspot labels 6, 7, 15 and 16 correspond to current-code positions 13, 12, 4 and 3.",
|
||
"",
|
||
"### Practical interpretation for fragment-library design",
|
||
"",
|
||
f"- Use `<= {design_min_atoms - 2}` heavy atoms as the **default cleanup filter** for the production fragment library.",
|
||
f"- Use the stricter `> {design_min_atoms - 1}` heavy-atom subset when discussing **position-wise diversity** or aligning with the previous exploratory analysis.",
|
||
f"- For 16-membered macrolide design, prioritize positions **13, 3, 4, 12 and 6** for natural-diversity-driven fragment mining.",
|
||
"- Keep positions **15** as a targeted acyl-modification site even though its chemotype diversity is narrower.",
|
||
"- Treat positions **7 and 16** as hypothesis-driven medicinal chemistry positions that need literature or synthesis justification beyond database prevalence.",
|
||
"- When comparing to literature numbering, either rerun the hotspot panel with mirrored positions or label every reported position as `code_position (medchem_position)` to avoid directional ambiguity.",
|
||
"",
|
||
]
|
||
return "\n".join(lines)
|
||
|
||
|
||
def build_markdown_report_zh(
|
||
analysis_df: pd.DataFrame,
|
||
ring_df: pd.DataFrame,
|
||
filtered_ring_df: pd.DataFrame,
|
||
filter_candidates: pd.DataFrame,
|
||
standard_success_parent_count: int,
|
||
zero_fragment_ring_parents: pd.DataFrame,
|
||
diversity_gt3: pd.DataFrame,
|
||
position_counts: pd.DataFrame,
|
||
ring_sensitivity_table: pd.DataFrame,
|
||
hotspot_table: pd.DataFrame,
|
||
ring_size: int,
|
||
design_min_atoms: int,
|
||
) -> str:
|
||
conservative_filter = filter_candidates.loc[
|
||
filter_candidates["drop_if_atom_count_lte"] == max(design_min_atoms - 2, 1)
|
||
].iloc[0]
|
||
design_filter = filter_candidates.loc[
|
||
filter_candidates["drop_if_atom_count_lte"] == design_min_atoms - 1
|
||
].iloc[0]
|
||
robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20]
|
||
annotated_filtered = annotate_fragment_ring_counts(filtered_ring_df)
|
||
acyclic_gt3 = annotated_filtered[annotated_filtered["fragment_ring_count"] == 0].copy()
|
||
acyclic_diversity = build_position_diversity_table(acyclic_gt3) if not acyclic_gt3.empty else diversity_gt3.iloc[0:0].copy()
|
||
|
||
paper_ready_paragraph = (
|
||
f"在当前验证后的 MacrolactoneDB 片段库中,桥连或双锚点侧链不会进入当前片段库,因此 16 元大环内酯位点排序的变化并不是由桥环片段误纳入所致。"
|
||
f"本研究的断裂规则仅保留与主环具有单一连接点的可拼接侧链;然而,许多被保留的大侧链本身仍可包含糖环、杂环或稠合环等 cyclic single-anchor side chains(带环单锚点侧链),"
|
||
f"这会显著抬高 13、3、4、12 以及 6 位的天然侧链多样性统计。相反,当仅保留 >3 个重原子且进一步限定为非环状侧链后,位点排序转而更偏向 15、9、12 和 3 位,"
|
||
f"说明 13、3、4、12 的优势主要反映了天然带环侧链的富集,而 15 位则更接近药物化学中常见的非环状酰基修饰位点。"
|
||
f"因此,在论文讨论中应明确区分“天然带环侧链多样性热点”和“非环状半合成修饰热点”这两类位点概念。"
|
||
)
|
||
|
||
lines = [
|
||
"# 大环内酯碎片库分析报告(中文)",
|
||
"",
|
||
"## 数据范围",
|
||
"",
|
||
f"- 当前验证后的可拼接碎片库包含 **{len(analysis_df):,}** 条片段记录,来源于 **{analysis_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
|
||
f"- 其中 {ring_size} 元环子集包含 **{len(ring_df):,}** 条片段记录,来源于 **{ring_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
|
||
f"- 这里的“可拼接碎片”指验证库中 `source_type='validation_extract'` 且 `splice_ready=1` 的单锚点侧链片段。桥环、稠环或任何具有多个环连接点的侧链都已经在生成阶段被排除,不会进入这份库。",
|
||
f"- 16 元环子集是按母体元数据里的 `ring_size=16` 直接过滤出来的,不是从片段反推 ring size。",
|
||
f"- 用于设计相关位点分析的严格子集定义为:片段重原子数 **>= {design_min_atoms}**。",
|
||
"",
|
||
"## 16 元环母体计数口径说明",
|
||
"",
|
||
f"- 在数据库里,`standard_macrolactone + ring_size={ring_size} + processing_status=success` 一共有 **{standard_success_parent_count:,}** 个母体。",
|
||
f"- 其中 **{ring_df['source_parent_ml_id'].nunique():,}** 个母体至少产出过 1 条可拼接片段,所以进入了当前报告的 16 元环片段统计。",
|
||
f"- 剩余 **{len(zero_fragment_ring_parents):,}** 个母体没有任何可拼接片段;它们的共同特征是 `num_sidechains=0`、`cleavage_positions=[]`。",
|
||
"",
|
||
"```text",
|
||
zero_fragment_ring_parents.to_string(index=False) if not zero_fragment_ring_parents.empty else "No zero-fragment parents.",
|
||
"```",
|
||
"",
|
||
"## 全库碎片大小结论",
|
||
"",
|
||
f"- 默认清洗阈值建议使用 `<= {design_min_atoms - 2}` 重原子删除。该阈值会删除 **{int(conservative_filter.removed_rows):,}** 条记录({conservative_filter.removed_row_fraction:.1%}),但仅删除 **{int(conservative_filter.removed_unique_fragments):,}** 个唯一片段({conservative_filter.removed_unique_fraction:.1%})。",
|
||
f"- 若用于和你之前的分析口径对齐,则建议采用 `> {design_min_atoms - 1}` 重原子作为设计相关片段集合。此时会删除 **{int(design_filter.removed_rows):,}** 条记录({design_filter.removed_row_fraction:.1%})。",
|
||
"",
|
||
"## 16 元环位点结论",
|
||
"",
|
||
f"- 在 {ring_size} 元环中,保留所有可拼接片段时,天然侧链多样性较高的位置主要集中在 `13、3/4、12`,并且 6 位也显示出较强的设计相关多样性。",
|
||
f"- 在 `> {design_min_atoms - 1}` 重原子子集中,样本量足够且多样性较高的位点为:",
|
||
"",
|
||
"```text",
|
||
robust_gt3[
|
||
[
|
||
"cleavage_position",
|
||
"total_fragments",
|
||
"unique_fragments",
|
||
"normalized_shannon_entropy",
|
||
"mean_pairwise_tanimoto_distance",
|
||
"mean_atom_count",
|
||
]
|
||
]
|
||
.sort_values("normalized_shannon_entropy", ascending=False)
|
||
.head(8)
|
||
.to_string(index=False),
|
||
"```",
|
||
"",
|
||
"- 其中 6 位最能支持你的药化判断:它不是最高频位点,但在设计相关大侧链中显示出很高的结构多样性。",
|
||
"- 15 位则更偏向高频但低多样性的酰基修饰位点。",
|
||
"",
|
||
"## 编号校准说明(代码编号 vs 药化编号)",
|
||
"",
|
||
"- 当前代码和数据库采用统一编号:`1 = 内酯羰基碳`,`2 = 相邻酯氧`,`3..N` 则从 2 位出发沿环的唯一遍历顺序继续编号。",
|
||
"- 如果药化文献同样固定 1 和 2 位,但把 `3..N` 按相反方向编号,则对于 `p >= 3` 有镜像换算公式:`p_镜像 = ring_size - p + 3`。",
|
||
f"- 对于 {ring_size} 元环,你关心的药化位点 `{','.join(str(position) for position in hotspot_table['cleavage_position'].tolist())}`,在当前代码编号下对应为:`{format_position_mapping(hotspot_table['cleavage_position'].tolist(), ring_size)}`。",
|
||
f"- 反过来,当前代码编号下的天然多样性热点 `13、3、4、12`,在药化镜像编号下分别对应 `6、16、15、7`。",
|
||
"- 因此,之前看起来对不上的 `13、3、4、12` 与 `6、7、15、16`,本质上是同一组位点的方向镜像,不是化学结论冲突。",
|
||
"- 建议后续统一规则:数据库、断裂结果、拼接和模型训练一律使用当前代码编号;论文、图表和药化讨论中若需对照文献,再同时标注镜像药化编号。",
|
||
"",
|
||
"## 桥环 / 稠环干扰的敏感性分析",
|
||
"",
|
||
"桥连或双锚点侧链不会进入当前片段库,因为断裂逻辑只保留与主环存在 **1 个连接点** 的侧链组件。也就是说,真正的 bridge / fused multi-anchor components 已被代码层面排除。上面那 6 个 16 元环母体并不是这类“被误收进来的桥环碎片”,而是根本没有任何可拼接外侧链,所以不会产生 fragment 行。",
|
||
"",
|
||
"但是,需要额外区分另一类情况:**cyclic single-anchor side chains**。这类片段虽然只在一个位置连到主环,因此会被保留下来,但片段自身可能包含糖环、杂环或其他环状骨架,仍然会显著影响位点多样性排名。",
|
||
"",
|
||
"敏感性分析结果如下:",
|
||
"",
|
||
"```text",
|
||
ring_sensitivity_table[
|
||
[
|
||
"cleavage_position",
|
||
"total_fragments_gt3",
|
||
"cyclic_fragment_rows",
|
||
"acyclic_fragment_rows",
|
||
"acyclic_row_fraction",
|
||
"cyclic_unique_fragments",
|
||
"acyclic_unique_fragments",
|
||
]
|
||
].to_string(index=False),
|
||
"```",
|
||
"",
|
||
"- `13` 位:`689/709` 条大侧链片段本身带环,说明该位点的高多样性主要由天然糖基/环状侧链驱动。",
|
||
"- `4` 位:`263/269` 条大侧链片段带环,几乎同样完全由环状侧链主导。",
|
||
"- `3` 位:`231/269` 条大侧链片段带环,带环片段占主导。",
|
||
"- `12` 位:环状与非环状侧链并存,但环状侧链仍占多数。",
|
||
"- `6` 位:多数保留的大侧链也带环,但仍表现出较强的多样性。",
|
||
"- `15` 位:几乎全部是非环状侧链,因此更接近半合成药化修饰位点的特征。",
|
||
"",
|
||
"## 仅看非环状侧链时的位点排序",
|
||
"",
|
||
f"- 当只保留 `> {design_min_atoms - 1}` 重原子且 **不含环** 的侧链时,位点排序明显变化:",
|
||
"",
|
||
"```text",
|
||
acyclic_diversity[
|
||
[
|
||
"cleavage_position",
|
||
"total_fragments",
|
||
"unique_fragments",
|
||
"normalized_shannon_entropy",
|
||
"mean_pairwise_tanimoto_distance",
|
||
"mean_atom_count",
|
||
]
|
||
]
|
||
.sort_values("total_fragments", ascending=False)
|
||
.head(8)
|
||
.to_string(index=False),
|
||
"```",
|
||
"",
|
||
"- 在这个更严格的口径下,`15` 位成为最主要的非环状侧链位点,随后是 `9、12、3` 位。",
|
||
"- 因此,`13、3、4、12` 的高天然多样性是真实存在的,但它主要表征的是**带环单锚点天然侧链**的富集,而不是桥连片段泄漏。",
|
||
"",
|
||
"## 论文可直接引用的中文讨论段落",
|
||
"",
|
||
paper_ready_paragraph,
|
||
"",
|
||
"## 建议的论文表述方式",
|
||
"",
|
||
"- 若讨论天然产物中的侧链多样性,可写为:`按当前代码编号,16 元大环内酯的天然侧链多样性主要集中在 13、3/4 和 12 位,并在 6 位保留较强的设计相关多样性;若换成药化镜像编号,则对应为 6、16/15 和 7 位。`",
|
||
"- 若讨论药化半合成改造热点,可写为:`按药化镜像编号,6、7、15、16 位代表文献和先导化合物研究中优先使用的衍生化位点;在当前代码编号下,它们对应 13、12、4、3 位。`",
|
||
"- 若专门讨论非环状侧链设计,则应强调:`在排除 <=3 重原子小片段并进一步排除带环侧链后,15 位是最主要的非环状侧链修饰位点。`",
|
||
"- 若在图表中同时展示两套体系,建议统一写成:`代码编号 13(药化 6)` 这类双标签格式,而不要在同一表中混用单独编号。",
|
||
"",
|
||
"## 相关图表",
|
||
"",
|
||
"- 全库原子数分布:`fragment_atom_count_distribution.png`",
|
||
f"- {ring_size} 元环过滤前后位点数量对比:`ring{ring_size}_position_count_comparison.png`",
|
||
f"- {ring_size} 元环大侧链 boxplot:`ring{ring_size}_position_atom_count_boxplot_gt3.png`",
|
||
f"- {ring_size} 元环位点多样性图:`ring{ring_size}_position_diversity_gt3.png`",
|
||
f"- {ring_size} 元环桥环/带环侧链敏感性图:`ring{ring_size}_position_ring_sensitivity.png`",
|
||
f"- {ring_size} 元环药化热点对比图:`ring{ring_size}_medchem_hotspot_comparison.png`",
|
||
"",
|
||
]
|
||
return "\n".join(lines)
|
||
|
||
|
||
def build_summary_lines(
|
||
dataframe: pd.DataFrame,
|
||
ring_dataframe: pd.DataFrame,
|
||
filtered_ring_dataframe: pd.DataFrame,
|
||
filter_candidates: pd.DataFrame,
|
||
diversity_table: pd.DataFrame,
|
||
diversity_gt3: pd.DataFrame,
|
||
hotspot_table: pd.DataFrame,
|
||
ring_size: int,
|
||
design_min_atoms: int,
|
||
) -> list[str]:
|
||
atom_counts = dataframe["fragment_atom_count"]
|
||
robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20] if not diversity_gt3.empty else diversity_gt3
|
||
lines = [
|
||
f"Analyzed rows: {len(dataframe)}",
|
||
f"Unique parent molecules: {dataframe['source_parent_ml_id'].nunique()}",
|
||
f"Unique fragment smiles: {dataframe['fragment_smiles_plain'].nunique()}",
|
||
f"Fragment atom count percentiles: p05={atom_counts.quantile(0.05):.1f}, p25={atom_counts.quantile(0.25):.1f}, p50={atom_counts.quantile(0.5):.1f}, p75={atom_counts.quantile(0.75):.1f}, p95={atom_counts.quantile(0.95):.1f}",
|
||
"",
|
||
"Filter candidates (drop fragments with atom_count <= threshold):",
|
||
]
|
||
|
||
for candidate in filter_candidates.head(5).itertuples(index=False):
|
||
lines.append(
|
||
" "
|
||
f"<= {candidate.drop_if_atom_count_lte}: "
|
||
f"remove {candidate.removed_rows} rows ({candidate.removed_row_fraction:.1%}), "
|
||
f"remove {candidate.removed_unique_fragments} unique fragments ({candidate.removed_unique_fraction:.1%})"
|
||
)
|
||
|
||
lines.extend(
|
||
[
|
||
"",
|
||
f"Ring {ring_size} rows: {len(ring_dataframe)}",
|
||
f"Ring {ring_size} unique fragment smiles: {ring_dataframe['fragment_smiles_plain'].nunique()}",
|
||
f"Ring {ring_size} rows with >= {design_min_atoms} heavy atoms: {len(filtered_ring_dataframe)}",
|
||
f"Ring {ring_size} unique fragment smiles with >= {design_min_atoms} heavy atoms: {filtered_ring_dataframe['fragment_smiles_plain'].nunique()}",
|
||
"",
|
||
f"Ring {ring_size} top positions by normalized Shannon entropy:",
|
||
]
|
||
)
|
||
|
||
for row in diversity_table.sort_values("normalized_shannon_entropy", ascending=False).head(5).itertuples(index=False):
|
||
lines.append(
|
||
" "
|
||
f"Position {row.cleavage_position}: entropy={row.normalized_shannon_entropy:.3f}, "
|
||
f"unique={row.unique_fragments}, mean_atom_count={row.mean_atom_count:.2f}"
|
||
)
|
||
|
||
lines.extend(["", f"Ring {ring_size} top positions by mean pairwise Tanimoto distance:"])
|
||
for row in diversity_table.sort_values("mean_pairwise_tanimoto_distance", ascending=False).head(5).itertuples(index=False):
|
||
lines.append(
|
||
" "
|
||
f"Position {row.cleavage_position}: distance={row.mean_pairwise_tanimoto_distance:.3f}, "
|
||
f"entropy={row.normalized_shannon_entropy:.3f}, atom_count_range={row.atom_count_range}"
|
||
)
|
||
|
||
if not robust_gt3.empty:
|
||
lines.extend(["", f"Ring {ring_size} top filtered positions by normalized Shannon entropy:"])
|
||
for row in robust_gt3.sort_values("normalized_shannon_entropy", ascending=False).head(5).itertuples(index=False):
|
||
lines.append(
|
||
" "
|
||
f"Position {row.cleavage_position}: entropy={row.normalized_shannon_entropy:.3f}, "
|
||
f"unique={row.unique_fragments}, total={row.total_fragments}, mean_atom_count={row.mean_atom_count:.2f}"
|
||
)
|
||
|
||
lines.extend(["", "Medicinal-chemistry hotspot comparison:"])
|
||
for row in hotspot_table.itertuples(index=False):
|
||
lines.append(
|
||
" "
|
||
f"Position {row.cleavage_position}: all={row.total_fragments_all}, "
|
||
f">={design_min_atoms} atoms={row.total_fragments_gt3}, "
|
||
f"unique_filtered={row.unique_fragments_gt3}, "
|
||
f"entropy_filtered={row.normalized_shannon_entropy_gt3:.3f}"
|
||
)
|
||
|
||
lines.extend(
|
||
[
|
||
"",
|
||
"Interpretation note: atom-count spread is only a coarse proxy for diversity.",
|
||
"Use entropy and fingerprint distance as primary diversity evidence; use atom-count spread as supporting context.",
|
||
"For cyclic-side-chain sensitivity, see ring_sensitivity output and the markdown report.",
|
||
]
|
||
)
|
||
return lines
|
||
|
||
|
||
def main(argv: list[str] | None = None) -> None:
|
||
args = build_parser().parse_args(argv)
|
||
output_dir = Path(args.output_dir)
|
||
output_dir.mkdir(parents=True, exist_ok=True)
|
||
medchem_positions = [int(part.strip()) for part in args.medchem_positions.split(",") if part.strip()]
|
||
|
||
sns.set_theme(style="whitegrid")
|
||
|
||
dataframe = load_fragment_library_dataset(args.input, args.db)
|
||
analysis_df = dataframe[
|
||
dataframe["splice_ready"].fillna(False) & (dataframe["classification"] == "standard_macrolactone")
|
||
].copy()
|
||
ring_df = analysis_df[analysis_df["ring_size"] == args.ring_size].copy()
|
||
filtered_ring_df = ring_df[ring_df["fragment_atom_count"] >= args.design_min_atoms].copy()
|
||
zero_fragment_ring_parents = build_zero_fragment_parent_table(args.db, args.ring_size)
|
||
with sqlite3.connect(args.db) as connection:
|
||
standard_success_parent_count = int(
|
||
pd.read_sql_query(
|
||
"""
|
||
SELECT COUNT(*) AS count
|
||
FROM parent_molecules
|
||
WHERE classification = 'standard_macrolactone'
|
||
AND ring_size = ?
|
||
AND processing_status = 'success'
|
||
""",
|
||
connection,
|
||
params=[args.ring_size],
|
||
).iloc[0]["count"]
|
||
)
|
||
|
||
if analysis_df.empty:
|
||
raise ValueError("No splice-ready standard macrolactone fragments available for analysis.")
|
||
if ring_df.empty:
|
||
raise ValueError(f"No splice-ready fragments found for ring size {args.ring_size}.")
|
||
|
||
atom_count_summary = build_fragment_atom_count_summary(analysis_df)
|
||
atom_count_frequency = build_fragment_atom_count_frequency_table(analysis_df)
|
||
filter_candidates = build_filter_candidate_table(analysis_df)
|
||
diversity_table = build_position_diversity_table(ring_df)
|
||
diversity_gt3 = (
|
||
build_position_diversity_table(filtered_ring_df) if not filtered_ring_df.empty else diversity_table.iloc[0:0].copy()
|
||
)
|
||
position_counts = build_position_count_comparison_table(ring_df, filtered_ring_df)
|
||
ring_sensitivity = build_ring_sensitivity_table(filtered_ring_df, position_counts)
|
||
hotspot_table = build_medchem_hotspot_table(medchem_positions, position_counts, diversity_table, diversity_gt3)
|
||
|
||
atom_count_summary.to_csv(output_dir / "fragment_atom_count_summary.csv", index=False)
|
||
atom_count_frequency.to_csv(output_dir / "fragment_atom_count_frequency.csv", index=False)
|
||
filter_candidates.to_csv(output_dir / "fragment_atom_count_filter_candidates.csv", index=False)
|
||
diversity_table.to_csv(output_dir / f"ring{args.ring_size}_position_diversity.csv", index=False)
|
||
diversity_gt3.to_csv(output_dir / f"ring{args.ring_size}_position_diversity_gt3.csv", index=False)
|
||
position_counts.to_csv(output_dir / f"ring{args.ring_size}_position_count_comparison.csv", index=False)
|
||
ring_sensitivity.to_csv(output_dir / f"ring{args.ring_size}_position_ring_sensitivity.csv", index=False)
|
||
hotspot_table.to_csv(output_dir / f"ring{args.ring_size}_medchem_hotspot_comparison.csv", index=False)
|
||
|
||
plot_fragment_atom_count_distribution(analysis_df, output_dir / "fragment_atom_count_distribution.png")
|
||
plot_ring_position_count_comparison(
|
||
position_counts,
|
||
args.ring_size,
|
||
args.design_min_atoms,
|
||
output_dir / f"ring{args.ring_size}_position_count_comparison.png",
|
||
)
|
||
plot_ring_position_atom_count_distribution(
|
||
ring_df, args.ring_size, output_dir / f"ring{args.ring_size}_position_atom_count_distribution.png"
|
||
)
|
||
plot_ring_position_atom_count_boxplot(
|
||
filtered_ring_df,
|
||
args.ring_size,
|
||
args.design_min_atoms,
|
||
output_dir / f"ring{args.ring_size}_position_atom_count_boxplot_gt3.png",
|
||
)
|
||
plot_ring_position_diversity(
|
||
diversity_table,
|
||
args.ring_size,
|
||
output_dir / f"ring{args.ring_size}_position_diversity.png",
|
||
)
|
||
plot_ring_position_diversity(
|
||
diversity_gt3,
|
||
args.ring_size,
|
||
output_dir / f"ring{args.ring_size}_position_diversity_gt3.png",
|
||
title_suffix=f"Side-Chain Diversity by Position (>= {args.design_min_atoms} Heavy Atoms)",
|
||
)
|
||
plot_ring_sensitivity(
|
||
ring_sensitivity,
|
||
args.ring_size,
|
||
args.design_min_atoms,
|
||
output_dir / f"ring{args.ring_size}_position_ring_sensitivity.png",
|
||
)
|
||
plot_medchem_hotspot_comparison(
|
||
hotspot_table,
|
||
args.ring_size,
|
||
args.design_min_atoms,
|
||
output_dir / f"ring{args.ring_size}_medchem_hotspot_comparison.png",
|
||
)
|
||
|
||
summary_lines = build_summary_lines(
|
||
analysis_df,
|
||
ring_df,
|
||
filtered_ring_df,
|
||
filter_candidates,
|
||
diversity_table,
|
||
diversity_gt3,
|
||
hotspot_table,
|
||
args.ring_size,
|
||
args.design_min_atoms,
|
||
)
|
||
(output_dir / "analysis_summary.txt").write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
|
||
report = build_markdown_report(
|
||
output_dir,
|
||
analysis_df,
|
||
ring_df,
|
||
filtered_ring_df,
|
||
filter_candidates,
|
||
diversity_table,
|
||
diversity_gt3,
|
||
position_counts,
|
||
ring_sensitivity,
|
||
hotspot_table,
|
||
args.ring_size,
|
||
args.design_min_atoms,
|
||
)
|
||
(output_dir / "fragment_library_analysis_report.md").write_text(report + "\n", encoding="utf-8")
|
||
report_zh = build_markdown_report_zh(
|
||
analysis_df,
|
||
ring_df,
|
||
filtered_ring_df,
|
||
filter_candidates,
|
||
standard_success_parent_count,
|
||
zero_fragment_ring_parents,
|
||
diversity_gt3,
|
||
position_counts,
|
||
ring_sensitivity,
|
||
hotspot_table,
|
||
args.ring_size,
|
||
args.design_min_atoms,
|
||
)
|
||
(output_dir / "fragment_library_analysis_report_zh.md").write_text(report_zh + "\n", encoding="utf-8")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|