macro_split/scripts/analyze_validation_fragment_library.py

from __future__ import annotations

import argparse
from math import ceil
from pathlib import Path

import matplotlib

matplotlib.use("Agg")

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors

from macro_lactone_toolkit.validation.fragment_library_analysis import (
    build_filter_candidate_table,
    build_fragment_atom_count_frequency_table,
    build_fragment_atom_count_summary,
    build_position_diversity_table,
    load_fragment_library_dataset,
)


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Analyze validation_output fragment_library.csv and generate atom-count and diversity reports."
    )
    parser.add_argument("--input", required=True, help="Path to validation_output/fragment_library.csv")
    parser.add_argument("--db", required=True, help="Path to validation_output/fragments.db")
    parser.add_argument("--output-dir", required=True, help="Directory to write plots and CSV summaries")
    parser.add_argument("--ring-size", type=int, default=16, help="Ring size to analyze position-wise diversity for")
    parser.add_argument(
        "--design-min-atoms",
        type=int,
        default=4,
        help="Minimum heavy-atom count to retain for design-oriented side-chain analysis",
    )
    parser.add_argument(
        "--medchem-positions",
        default="6,7,15,16",
        help="Comma-separated cleavage positions to highlight as literature or medicinal-chemistry hotspots",
    )
    return parser


def plot_fragment_atom_count_distribution(dataframe: pd.DataFrame, output_path: Path) -> None:
    figure, axes = plt.subplots(1, 2, figsize=(14, 5))

    sns.histplot(data=dataframe, x="fragment_atom_count", discrete=True, ax=axes[0], color="#3b82f6")
    axes[0].set_title("Fragment Atom Count Distribution")
    axes[0].set_xlabel("Fragment atom count")
    axes[0].set_ylabel("Fragment rows")

    sns.ecdfplot(data=dataframe, x="fragment_atom_count", ax=axes[1], color="#ef4444")
    axes[1].set_title("Fragment Atom Count ECDF")
    axes[1].set_xlabel("Fragment atom count")
    axes[1].set_ylabel("Cumulative fraction")

    figure.tight_layout()
    figure.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(figure)


def plot_ring_position_atom_count_distribution(dataframe: pd.DataFrame, ring_size: int, output_path: Path) -> None:
    positions = sorted(dataframe["cleavage_position"].unique().tolist())
    ncols = 4
    nrows = ceil(len(positions) / ncols)
    figure, axes = plt.subplots(nrows, ncols, figsize=(16, 3.8 * nrows), sharex=True, sharey=True)
    axes_flat = axes.flatten()

    for axis, position in zip(axes_flat, positions, strict=False):
        subset = dataframe[dataframe["cleavage_position"] == position]
        sns.histplot(data=subset, x="fragment_atom_count", discrete=True, ax=axis, color="#10b981")
        axis.set_title(f"Position {position} (n={len(subset)})")
        axis.set_xlabel("Atom count")
        axis.set_ylabel("Count")

    for axis in axes_flat[len(positions) :]:
        axis.axis("off")

    figure.suptitle(f"{ring_size}-Membered Ring: Atom Count Distribution by Cleavage Position", y=1.02)
    figure.tight_layout()
    figure.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(figure)


def plot_ring_position_diversity(
    diversity_table: pd.DataFrame,
    ring_size: int,
    output_path: Path,
    title_suffix: str = "Side-Chain Diversity by Position",
) -> None:
    if diversity_table.empty:
        figure, axis = plt.subplots(figsize=(10, 4))
        axis.text(0.5, 0.5, "No fragments available after filtering.", ha="center", va="center")
        axis.set_axis_off()
        figure.suptitle(f"{ring_size}-Membered Ring: {title_suffix}", y=0.98)
        figure.tight_layout()
        figure.savefig(output_path, dpi=300, bbox_inches="tight")
        plt.close(figure)
        return

    figure, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=True)

    sns.barplot(data=diversity_table, x="cleavage_position", y="unique_fragments", ax=axes[0], color="#2563eb")
    axes[0].set_title("Unique Fragment Types by Position")
    axes[0].set_ylabel("Unique fragments")

    sns.barplot(
        data=diversity_table,
        x="cleavage_position",
        y="normalized_shannon_entropy",
        ax=axes[1],
        color="#f59e0b",
    )
    axes[1].set_title("Normalized Shannon Entropy by Position")
    axes[1].set_ylabel("Normalized entropy")

    sns.barplot(
        data=diversity_table,
        x="cleavage_position",
        y="mean_pairwise_tanimoto_distance",
        ax=axes[2],
        color="#7c3aed",
    )
    axes[2].set_title("Mean Pairwise Tanimoto Distance by Position")
    axes[2].set_ylabel("Mean distance")
    axes[2].set_xlabel("Cleavage position")

    figure.suptitle(f"{ring_size}-Membered Ring: {title_suffix}", y=1.01)
    figure.tight_layout()
    figure.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(figure)


def build_position_count_comparison_table(all_dataframe: pd.DataFrame, filtered_dataframe: pd.DataFrame) -> pd.DataFrame:
    all_stats = (
        all_dataframe.groupby("cleavage_position")
        .agg(
            total_fragments_all=("fragment_smiles_plain", "size"),
            unique_fragments_all=("fragment_smiles_plain", "nunique"),
            mean_atom_count_all=("fragment_atom_count", "mean"),
        )
        .reset_index()
    )
    filtered_stats = (
        filtered_dataframe.groupby("cleavage_position")
        .agg(
            total_fragments_gt3=("fragment_smiles_plain", "size"),
            unique_fragments_gt3=("fragment_smiles_plain", "nunique"),
            mean_atom_count_gt3=("fragment_atom_count", "mean"),
        )
        .reset_index()
    )

    merged = all_stats.merge(filtered_stats, on="cleavage_position", how="left")
    integer_columns = [
        "total_fragments_gt3",
        "unique_fragments_gt3",
    ]
    for column in integer_columns:
        merged[column] = merged[column].fillna(0).astype(int)
    merged["mean_atom_count_gt3"] = merged["mean_atom_count_gt3"].fillna(0.0)
    merged["gt3_row_fraction"] = merged["total_fragments_gt3"] / merged["total_fragments_all"]
    merged["gt3_unique_fraction"] = merged["unique_fragments_gt3"] / merged["unique_fragments_all"]
    return merged.sort_values("cleavage_position").reset_index(drop=True)


def annotate_fragment_ring_counts(dataframe: pd.DataFrame) -> pd.DataFrame:
    result = dataframe.copy()
    if result.empty:
        result["fragment_ring_count"] = pd.Series(dtype=int)
        return result
    result["fragment_ring_count"] = result["fragment_smiles_plain"].apply(
        lambda smiles: rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(smiles))
    )
    return result


def build_ring_sensitivity_table(filtered_dataframe: pd.DataFrame, position_counts: pd.DataFrame) -> pd.DataFrame:
    rows: list[dict[str, float | int]] = []
    annotated = annotate_fragment_ring_counts(filtered_dataframe)

    for position in position_counts["cleavage_position"]:
        subset = annotated[annotated["cleavage_position"] == position].copy()
        if subset.empty:
            rows.append(
                {
                    "cleavage_position": int(position),
                    "total_fragments_gt3": 0,
                    "cyclic_fragment_rows": 0,
                    "acyclic_fragment_rows": 0,
                    "cyclic_row_fraction": 0.0,
                    "acyclic_row_fraction": 0.0,
                    "unique_fragments_gt3": 0,
                    "cyclic_unique_fragments": 0,
                    "acyclic_unique_fragments": 0,
                }
            )
            continue

        unique = subset[["fragment_smiles_plain", "fragment_ring_count"]].drop_duplicates()
        cyclic_rows = int((subset["fragment_ring_count"] > 0).sum())
        acyclic_rows = int((subset["fragment_ring_count"] == 0).sum())
        rows.append(
            {
                "cleavage_position": int(position),
                "total_fragments_gt3": int(len(subset)),
                "cyclic_fragment_rows": cyclic_rows,
                "acyclic_fragment_rows": acyclic_rows,
                "cyclic_row_fraction": cyclic_rows / len(subset),
                "acyclic_row_fraction": acyclic_rows / len(subset),
                "unique_fragments_gt3": int(subset["fragment_smiles_plain"].nunique()),
                "cyclic_unique_fragments": int((unique["fragment_ring_count"] > 0).sum()),
                "acyclic_unique_fragments": int((unique["fragment_ring_count"] == 0).sum()),
            }
        )

    return pd.DataFrame(rows).sort_values("cleavage_position").reset_index(drop=True)


def build_medchem_hotspot_table(
    medchem_positions: list[int],
    position_counts: pd.DataFrame,
    diversity_all: pd.DataFrame,
    diversity_gt3: pd.DataFrame,
) -> pd.DataFrame:
    frame = pd.DataFrame({"cleavage_position": medchem_positions})
    merged = frame.merge(position_counts, on="cleavage_position", how="left")
    merged = merged.merge(
        diversity_all[
            [
                "cleavage_position",
                "normalized_shannon_entropy",
                "mean_pairwise_tanimoto_distance",
            ]
        ].rename(
            columns={
                "normalized_shannon_entropy": "normalized_shannon_entropy_all",
                "mean_pairwise_tanimoto_distance": "mean_pairwise_tanimoto_distance_all",
            }
        ),
        on="cleavage_position",
        how="left",
    )
    merged = merged.merge(
        diversity_gt3[
            [
                "cleavage_position",
                "normalized_shannon_entropy",
                "mean_pairwise_tanimoto_distance",
            ]
        ].rename(
            columns={
                "normalized_shannon_entropy": "normalized_shannon_entropy_gt3",
                "mean_pairwise_tanimoto_distance": "mean_pairwise_tanimoto_distance_gt3",
            }
        ),
        on="cleavage_position",
        how="left",
    )
    numeric_fill_zero = [
        "total_fragments_all",
        "unique_fragments_all",
        "mean_atom_count_all",
        "total_fragments_gt3",
        "unique_fragments_gt3",
        "mean_atom_count_gt3",
        "gt3_row_fraction",
        "gt3_unique_fraction",
        "normalized_shannon_entropy_all",
        "mean_pairwise_tanimoto_distance_all",
        "normalized_shannon_entropy_gt3",
        "mean_pairwise_tanimoto_distance_gt3",
    ]
    for column in numeric_fill_zero:
        merged[column] = merged[column].fillna(0.0)

    integer_columns = [
        "total_fragments_all",
        "unique_fragments_all",
        "total_fragments_gt3",
        "unique_fragments_gt3",
    ]
    for column in integer_columns:
        merged[column] = merged[column].astype(int)

    return merged.sort_values("cleavage_position").reset_index(drop=True)


def plot_ring_position_count_comparison(
    comparison_table: pd.DataFrame,
    ring_size: int,
    design_min_atoms: int,
    output_path: Path,
) -> None:
    figure, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)

    melted = comparison_table.melt(
        id_vars="cleavage_position",
        value_vars=["total_fragments_all", "total_fragments_gt3"],
        var_name="series",
        value_name="count",
    )
    melted["series"] = melted["series"].map(
        {
            "total_fragments_all": "All splice-ready fragments",
            "total_fragments_gt3": f"Fragments with >={design_min_atoms} heavy atoms",
        }
    )
    sns.barplot(data=melted, x="cleavage_position", y="count", hue="series", ax=axes[0])
    axes[0].set_title("Fragment Counts by Position")
    axes[0].set_ylabel("Fragment rows")
    axes[0].set_xlabel("")
    axes[0].legend(loc="upper right")

    sns.barplot(data=comparison_table, x="cleavage_position", y="gt3_row_fraction", ax=axes[1], color="#f59e0b")
    axes[1].set_title(f"Fraction of Position-Specific Fragments with >={design_min_atoms} Heavy Atoms")
    axes[1].set_ylabel("Fraction")
    axes[1].set_xlabel("Cleavage position")
    axes[1].set_ylim(0, 1.0)

    figure.suptitle(f"{ring_size}-Membered Ring: Position Counts Before and After Size Filtering", y=1.01)
    figure.tight_layout()
    figure.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(figure)


def plot_ring_position_atom_count_boxplot(
    filtered_dataframe: pd.DataFrame,
    ring_size: int,
    design_min_atoms: int,
    output_path: Path,
) -> None:
    if filtered_dataframe.empty:
        figure, axis = plt.subplots(figsize=(10, 4))
        axis.text(0.5, 0.5, "No fragments satisfy the size filter.", ha="center", va="center")
        axis.set_axis_off()
        figure.suptitle(
            f"{ring_size}-Membered Ring: Atom Count Distribution for Fragments with >={design_min_atoms} Heavy Atoms",
            y=0.98,
        )
        figure.tight_layout()
        figure.savefig(output_path, dpi=300, bbox_inches="tight")
        plt.close(figure)
        return

    figure, axis = plt.subplots(figsize=(14, 6))
    sns.boxplot(data=filtered_dataframe, x="cleavage_position", y="fragment_atom_count", ax=axis, color="#10b981")
    axis.set_title(f"{ring_size}-Membered Ring: Atom Count Distribution for Fragments with >={design_min_atoms} Heavy Atoms")
    axis.set_xlabel("Cleavage position")
    axis.set_ylabel("Fragment atom count")
    figure.tight_layout()
    figure.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(figure)


def plot_medchem_hotspot_comparison(
    hotspot_table: pd.DataFrame,
    ring_size: int,
    design_min_atoms: int,
    output_path: Path,
) -> None:
    figure, axes = plt.subplots(2, 2, figsize=(14, 9), sharex=True)
    positions = hotspot_table["cleavage_position"].astype(str)

    sns.barplot(x=positions, y=hotspot_table["total_fragments_all"], ax=axes[0, 0], color="#2563eb")
    axes[0, 0].set_title("All splice-ready fragments")
    axes[0, 0].set_ylabel("Rows")
    axes[0, 0].set_xlabel("")

    sns.barplot(x=positions, y=hotspot_table["total_fragments_gt3"], ax=axes[0, 1], color="#10b981")
    axes[0, 1].set_title(f"Fragments with >={design_min_atoms} heavy atoms")
    axes[0, 1].set_ylabel("Rows")
    axes[0, 1].set_xlabel("")

    sns.barplot(x=positions, y=hotspot_table["unique_fragments_gt3"], ax=axes[1, 0], color="#f59e0b")
    axes[1, 0].set_title(f"Unique fragments with >={design_min_atoms} heavy atoms")
    axes[1, 0].set_ylabel("Unique SMILES")
    axes[1, 0].set_xlabel("Cleavage position")

    sns.barplot(x=positions, y=hotspot_table["normalized_shannon_entropy_gt3"], ax=axes[1, 1], color="#7c3aed")
    axes[1, 1].set_title("Normalized Shannon entropy after size filtering")
    axes[1, 1].set_ylabel("Entropy")
    axes[1, 1].set_xlabel("Cleavage position")
    axes[1, 1].set_ylim(0, 1.0)

    figure.suptitle(f"{ring_size}-Membered Ring: Medicinal-Chemistry Hotspot Comparison", y=1.01)
    figure.tight_layout()
    figure.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(figure)


def plot_ring_sensitivity(
    sensitivity_table: pd.DataFrame,
    ring_size: int,
    design_min_atoms: int,
    output_path: Path,
) -> None:
    figure, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)

    melted = sensitivity_table.melt(
        id_vars="cleavage_position",
        value_vars=["cyclic_fragment_rows", "acyclic_fragment_rows"],
        var_name="series",
        value_name="count",
    )
    melted["series"] = melted["series"].map(
        {
            "cyclic_fragment_rows": "Ring-bearing side chains",
            "acyclic_fragment_rows": "Acyclic side chains",
        }
    )
    sns.barplot(data=melted, x="cleavage_position", y="count", hue="series", ax=axes[0])
    axes[0].set_title(f"Fragments with >={design_min_atoms} Heavy Atoms: Cyclic vs Acyclic Side Chains")
    axes[0].set_ylabel("Fragment rows")
    axes[0].set_xlabel("")
    axes[0].legend(loc="upper right")

    sns.barplot(data=sensitivity_table, x="cleavage_position", y="acyclic_row_fraction", ax=axes[1], color="#f59e0b")
    axes[1].set_title("Acyclic Fraction After Size Filtering")
    axes[1].set_ylabel("Acyclic fraction")
    axes[1].set_xlabel("Cleavage position")
    axes[1].set_ylim(0, 1.0)

    figure.suptitle(f"{ring_size}-Membered Ring: Sensitivity to Ring-Bearing Side Chains", y=1.01)
    figure.tight_layout()
    figure.savefig(output_path, dpi=300, bbox_inches="tight")
    plt.close(figure)


def format_top_positions(table: pd.DataFrame, sort_column: str, limit: int = 5) -> str:
    if table.empty:
        return "No positions available."
    subset = table.sort_values(sort_column, ascending=False).head(limit)
    return subset.to_string(index=False)


def build_markdown_report(
    output_dir: Path,
    analysis_df: pd.DataFrame,
    ring_df: pd.DataFrame,
    filtered_ring_df: pd.DataFrame,
    filter_candidates: pd.DataFrame,
    diversity_all: pd.DataFrame,
    diversity_gt3: pd.DataFrame,
    position_counts: pd.DataFrame,
    ring_sensitivity_table: pd.DataFrame,
    hotspot_table: pd.DataFrame,
    ring_size: int,
    design_min_atoms: int,
) -> str:
    atom_counts = analysis_df["fragment_atom_count"]
    annotated_filtered = annotate_fragment_ring_counts(filtered_ring_df)
    acyclic_gt3 = annotated_filtered[annotated_filtered["fragment_ring_count"] == 0].copy()
    acyclic_diversity = build_position_diversity_table(acyclic_gt3) if not acyclic_gt3.empty else diversity_gt3.iloc[0:0].copy()
    conservative_filter = filter_candidates.loc[
        filter_candidates["drop_if_atom_count_lte"] == max(design_min_atoms - 2, 1)
    ].iloc[0]
    design_filter = filter_candidates.loc[
        filter_candidates["drop_if_atom_count_lte"] == design_min_atoms - 1
    ].iloc[0]
    robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20]
    top_filtered_counts = position_counts[
        ["cleavage_position", "total_fragments_gt3", "gt3_row_fraction"]
    ].sort_values("total_fragments_gt3", ascending=False).head(6)

    lines = [
        "# Fragment Library Analysis Report",
        "",
        "## Scope and Dataset",
        "",
        f"- Input fragment library: `{analysis_df['source_type'].mode().iloc[0]}` rows merged with `parent_molecules` metadata.",
        f"- Current validated design library contains **{len(analysis_df):,}** splice-ready fragment rows from **{analysis_df['source_parent_ml_id'].nunique():,}** parent macrolactones.",
        f"- The {ring_size}-membered subset contains **{len(ring_df):,}** fragment rows from **{ring_df['source_parent_ml_id'].nunique():,}** parent molecules.",
        f"- This dataset is narrower than the earlier broad workflow summary because it only keeps **splice-ready, single-anchor fragments** from the validated library. It should not be compared to prior total-fragment counts as if they were the same denominator.",
        "",
        "## Figure 1. Global Atom-Count Distribution",
        "",
        "![Fragment atom count distribution](fragment_atom_count_distribution.png)",
        "",
        f"- Heavy-atom count is extremely right-skewed: p05={atom_counts.quantile(0.05):.1f}, p25={atom_counts.quantile(0.25):.1f}, median={atom_counts.quantile(0.50):.1f}, p75={atom_counts.quantile(0.75):.1f}, p95={atom_counts.quantile(0.95):.1f}.",
        f"- A conservative cleanup filter of `<= {design_min_atoms - 2}` heavy atoms removes **{int(conservative_filter.removed_rows):,}** rows ({conservative_filter.removed_row_fraction:.1%}) but only **{int(conservative_filter.removed_unique_fragments):,}** unique fragment SMILES ({conservative_filter.removed_unique_fraction:.1%}).",
        f"- A design-oriented filter aligned with your previous analysis, `<= {design_min_atoms - 1}` heavy atoms, removes **{int(design_filter.removed_rows):,}** rows ({design_filter.removed_row_fraction:.1%}) and **{int(design_filter.removed_unique_fragments):,}** unique fragment SMILES ({design_filter.removed_unique_fraction:.1%}).",
        f"- Interpretation: `<= {design_min_atoms - 2}` is the safer default for library cleanup, while `> {design_min_atoms - 1}` is useful for positional diversity analysis because it suppresses one- and two-atom noise.",
        "",
        "## Figure 2. Ring-Specific Counts Before and After Size Filtering",
        "",
        f"![Ring {ring_size} position count comparison](ring{ring_size}_position_count_comparison.png)",
        "",
        f"- This figure compares all splice-ready fragments with the design-oriented subset that keeps fragments with **>= {design_min_atoms} heavy atoms**.",
        f"- After size filtering, the most populated {ring_size}-membered positions are:",
        "",
        "```text",
        top_filtered_counts.to_string(index=False),
        "```",
        "",
        "## Figure 3. Atom-Count Spread for Design-Relevant Fragments",
        "",
        f"![Ring {ring_size} atom-count boxplot](ring{ring_size}_position_atom_count_boxplot_gt3.png)",
        "",
        f"- This boxplot only includes fragments with **>= {design_min_atoms} heavy atoms**.",
        f"- Positions 13, 3, 4, 11 and 6 carry the broadest large-fragment size envelopes; position 15 remains common but its size spread is narrower, indicating repeated reuse of a small set of acyl-like substituents.",
        "",
        "## Figure 4. Position-Wise Diversity Metrics After Filtering",
        "",
        f"![Ring {ring_size} filtered diversity](ring{ring_size}_position_diversity_gt3.png)",
        "",
        "- Diversity is evaluated with three complementary metrics:",
        "  1. `unique_fragments`: raw chemotype count.",
        "  2. `normalized_shannon_entropy`: how evenly those chemotypes are distributed.",
        "  3. `mean_pairwise_tanimoto_distance`: structural spread in Morgan fingerprint space.",
        "",
        "Top robust positions after removing <=3-heavy-atom fragments (`total_fragments >= 20`):",
        "",
        "```text",
        format_top_positions(
            robust_gt3[
                [
                    "cleavage_position",
                    "total_fragments",
                    "unique_fragments",
                    "normalized_shannon_entropy",
                    "mean_pairwise_tanimoto_distance",
                    "mean_atom_count",
                ]
            ],
            "normalized_shannon_entropy",
            limit=8,
        ),
        "```",
        "",
        f"- Within the filtered {ring_size}-membered set, **position 6** is the strongest support for your medicinal-chemistry hypothesis: it has moderate abundance (**{int(hotspot_table.loc[hotspot_table['cleavage_position'] == 6, 'total_fragments_gt3'].iloc[0])}** rows) but high chemotype diversity and near-maximal entropy.",
        f"- **Position 15** is also clearly relevant because it retains **{int(hotspot_table.loc[hotspot_table['cleavage_position'] == 15, 'total_fragments_gt3'].iloc[0])}** filtered rows, but its diversity is narrow; the site is frequent, yet dominated by a few acyl motifs.",
        "",
        "## Figure 5. Focus on Medicinal-Chemistry Hotspots",
        "",
        f"![Ring {ring_size} medchem hotspot comparison](ring{ring_size}_medchem_hotspot_comparison.png)",
        "",
        "This panel focuses on positions 6, 7, 15 and 16 because these are the literature-guided derivatization positions from tylosin / tilmicosin / tildipirosin / tulathromycin-like scaffold analysis.",
        "",
        "```text",
        hotspot_table[
            [
                "cleavage_position",
                "total_fragments_all",
                "total_fragments_gt3",
                "unique_fragments_gt3",
                "normalized_shannon_entropy_gt3",
                "mean_pairwise_tanimoto_distance_gt3",
            ]
        ].to_string(index=False),
        "```",
        "",
        "- Position 6: supported by the current database as a **design-relevant and structurally diverse** site.",
        "- Position 7: not supported as a prevalent natural hotspot in the current database; it appears only a few times after filtering, so it should be described as a **literature- or scaffold-guided modification site**, not a database-enriched site.",
        "- Position 15: supported as a **frequent modification site**, but the retained chemotypes are concentrated into a small number of acyl substituents.",
        "- Position 16: not prevalent in the current database, but the few retained fragments are structurally distinct singletons; this makes it a **low-evidence exploratory site**, not a high-confidence natural hotspot.",
        "",
        "## Figure 6. Are the Top Positions Driven by Ring-Bearing Side Chains?",
        "",
        f"![Ring {ring_size} ring sensitivity](ring{ring_size}_position_ring_sensitivity.png)",
        "",
        f"- Multi-anchor bridge or fused components do **not** enter the fragment library because the collector only keeps side-chain components with exactly one ring connection; see [src/macro_lactone_toolkit/_core.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/_core.py#L293) and [src/macro_lactone_toolkit/validation/validator.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/validation/validator.py#L206).",
        "- What can still happen is that a retained fragment is a **single-anchor but ring-bearing side chain** such as a sugar, heterocycle or other cyclic appendage.",
        "",
        "```text",
        ring_sensitivity_table[
            [
                "cleavage_position",
                "total_fragments_gt3",
                "cyclic_fragment_rows",
                "acyclic_fragment_rows",
                "acyclic_row_fraction",
                "cyclic_unique_fragments",
                "acyclic_unique_fragments",
            ]
        ].to_string(index=False),
        "```",
        "",
        "- This shows that positions 13, 4, 3 and 6 are indeed dominated by **ring-bearing single-anchor side chains**, not by leaked bridge fragments.",
        "- Position 15 behaves differently: almost all retained `>3` fragments are **acyclic** acyl-like substituents.",
        "- Therefore, if your scientific question is specifically about non-cyclic side-chain diversification, the ranking should be recalculated on an acyclic-only subset.",
        "",
        "Acyclic-only ranking after removing <=3-heavy-atom fragments:",
        "",
        "```text",
        format_top_positions(
            acyclic_diversity[
                [
                    "cleavage_position",
                    "total_fragments",
                    "unique_fragments",
                    "normalized_shannon_entropy",
                    "mean_pairwise_tanimoto_distance",
                    "mean_atom_count",
                ]
            ].sort_values("total_fragments", ascending=False),
            "total_fragments",
            limit=8,
        ),
        "```",
        "",
        "- Under this stricter acyclic-only view, position 15 becomes the dominant site, followed by 9, 12 and 3. Positions 13 and 4 no longer dominate, confirming that their earlier prominence is mainly driven by cyclic side chains.",
        "",
        "## Reconciliation With the Previous Conclusion",
        "",
        "The earlier statement that `6,7,15,16` are important 16-membered macrolide modification positions is **not invalidated** by the current analysis, but it must be phrased carefully because it answers a different question.",
        "",
        "- The **previous conclusion** came from scaffold-centric medicinal chemistry on known 16-membered macrolides and identifies **synthetically exploited modification positions**.",
        "- The **current MacrolactoneDB analysis** measures **natural side-chain occurrence and diversity** in a validated fragment library.",
        "- These are related but not identical concepts. A site can be medicinally attractive even if it is rare in natural products, and a site can be naturally diverse without being the preferred position for semisynthetic optimization.",
        "",
        "### Recommended paper-safe wording",
        "",
        f"> In the validated MacrolactoneDB fragment library, natural side-chain diversity of {ring_size}-membered macrolactones is concentrated primarily at positions 13, 3/4 and 12. After excluding fragments with <=3 heavy atoms to focus on design-relevant substituents, position 6 remains strongly diversity-enriched and position 15 remains frequency-enriched, whereas positions 7 and 16 are sparse and should be interpreted as literature-guided derivatization sites rather than statistically dominant natural hotspots.",
        "",
        "### Practical interpretation for fragment-library design",
        "",
        f"- Use `<= {design_min_atoms - 2}` heavy atoms as the **default cleanup filter** for the production fragment library.",
        f"- Use the stricter `> {design_min_atoms - 1}` heavy-atom subset when discussing **position-wise diversity** or aligning with the previous exploratory analysis.",
        f"- For 16-membered macrolide design, prioritize positions **13, 3, 4, 12 and 6** for natural-diversity-driven fragment mining.",
        "- Keep positions **15** as a targeted acyl-modification site even though its chemotype diversity is narrower.",
        "- Treat positions **7 and 16** as hypothesis-driven medicinal chemistry positions that need literature or synthesis justification beyond database prevalence.",
        "",
    ]
    return "\n".join(lines)


def build_markdown_report_zh(
    analysis_df: pd.DataFrame,
    ring_df: pd.DataFrame,
    filtered_ring_df: pd.DataFrame,
    filter_candidates: pd.DataFrame,
    diversity_gt3: pd.DataFrame,
    position_counts: pd.DataFrame,
    ring_sensitivity_table: pd.DataFrame,
    hotspot_table: pd.DataFrame,
    ring_size: int,
    design_min_atoms: int,
) -> str:
    conservative_filter = filter_candidates.loc[
        filter_candidates["drop_if_atom_count_lte"] == max(design_min_atoms - 2, 1)
    ].iloc[0]
    design_filter = filter_candidates.loc[
        filter_candidates["drop_if_atom_count_lte"] == design_min_atoms - 1
    ].iloc[0]
    robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20]
    annotated_filtered = annotate_fragment_ring_counts(filtered_ring_df)
    acyclic_gt3 = annotated_filtered[annotated_filtered["fragment_ring_count"] == 0].copy()
    acyclic_diversity = build_position_diversity_table(acyclic_gt3) if not acyclic_gt3.empty else diversity_gt3.iloc[0:0].copy()

    paper_ready_paragraph = (
        f"在当前验证后的 MacrolactoneDB 片段库中，桥连或双锚点侧链不会进入当前片段库，因此 16 元大环内酯位点排序的变化并不是由桥环片段误纳入所致。"
        f"本研究的断裂规则仅保留与主环具有单一连接点的可拼接侧链；然而，许多被保留的大侧链本身仍可包含糖环、杂环或稠合环等 cyclic single-anchor side chains（带环单锚点侧链），"
        f"这会显著抬高 13、3、4、12 以及 6 位的天然侧链多样性统计。相反，当仅保留 >3 个重原子且进一步限定为非环状侧链后，位点排序转而更偏向 15、9、12 和 3 位，"
        f"说明 13、3、4、12 的优势主要反映了天然带环侧链的富集，而 15 位则更接近药物化学中常见的非环状酰基修饰位点。"
        f"因此，在论文讨论中应明确区分“天然带环侧链多样性热点”和“非环状半合成修饰热点”这两类位点概念。"
    )

    lines = [
        "# 大环内酯碎片库分析报告（中文）",
        "",
        "## 数据范围",
        "",
        f"- 当前验证后的可拼接碎片库包含 **{len(analysis_df):,}** 条片段记录，来源于 **{analysis_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
        f"- 其中 {ring_size} 元环子集包含 **{len(ring_df):,}** 条片段记录，来源于 **{ring_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
        f"- 用于设计相关位点分析的严格子集定义为：片段重原子数 **>= {design_min_atoms}**。",
        "",
        "## 全库碎片大小结论",
        "",
        f"- 默认清洗阈值建议使用 `<= {design_min_atoms - 2}` 重原子删除。该阈值会删除 **{int(conservative_filter.removed_rows):,}** 条记录（{conservative_filter.removed_row_fraction:.1%}），但仅删除 **{int(conservative_filter.removed_unique_fragments):,}** 个唯一片段（{conservative_filter.removed_unique_fraction:.1%}）。",
        f"- 若用于和你之前的分析口径对齐，则建议采用 `> {design_min_atoms - 1}` 重原子作为设计相关片段集合。此时会删除 **{int(design_filter.removed_rows):,}** 条记录（{design_filter.removed_row_fraction:.1%}）。",
        "",
        "## 16 元环位点结论",
        "",
        f"- 在 {ring_size} 元环中，保留所有可拼接片段时，天然侧链多样性较高的位置主要集中在 `13、3/4、12`，并且 6 位也显示出较强的设计相关多样性。",
        f"- 在 `> {design_min_atoms - 1}` 重原子子集中，样本量足够且多样性较高的位点为：",
        "",
        "```text",
        robust_gt3[
            [
                "cleavage_position",
                "total_fragments",
                "unique_fragments",
                "normalized_shannon_entropy",
                "mean_pairwise_tanimoto_distance",
                "mean_atom_count",
            ]
        ]
        .sort_values("normalized_shannon_entropy", ascending=False)
        .head(8)
        .to_string(index=False),
        "```",
        "",
        "- 其中 6 位最能支持你的药化判断：它不是最高频位点，但在设计相关大侧链中显示出很高的结构多样性。",
        "- 15 位则更偏向高频但低多样性的酰基修饰位点。",
        "",
        "## 桥环 / 稠环干扰的敏感性分析",
        "",
        "桥连或双锚点侧链不会进入当前片段库，因为断裂逻辑只保留与主环存在 **1 个连接点** 的侧链组件。也就是说，真正的 bridge / fused multi-anchor components 已被代码层面排除。",
        "",
        "但是，需要额外区分另一类情况：**cyclic single-anchor side chains**。这类片段虽然只在一个位置连到主环，因此会被保留下来，但片段自身可能包含糖环、杂环或其他环状骨架，仍然会显著影响位点多样性排名。",
        "",
        "敏感性分析结果如下：",
        "",
        "```text",
        ring_sensitivity_table[
            [
                "cleavage_position",
                "total_fragments_gt3",
                "cyclic_fragment_rows",
                "acyclic_fragment_rows",
                "acyclic_row_fraction",
                "cyclic_unique_fragments",
                "acyclic_unique_fragments",
            ]
        ].to_string(index=False),
        "```",
        "",
        "- `13` 位：`689/709` 条大侧链片段本身带环，说明该位点的高多样性主要由天然糖基/环状侧链驱动。",
        "- `4` 位：`263/269` 条大侧链片段带环，几乎同样完全由环状侧链主导。",
        "- `3` 位：`231/269` 条大侧链片段带环，带环片段占主导。",
        "- `12` 位：环状与非环状侧链并存，但环状侧链仍占多数。",
        "- `6` 位：多数保留的大侧链也带环，但仍表现出较强的多样性。",
        "- `15` 位：几乎全部是非环状侧链，因此更接近半合成药化修饰位点的特征。",
        "",
        "## 仅看非环状侧链时的位点排序",
        "",
        f"- 当只保留 `> {design_min_atoms - 1}` 重原子且 **不含环** 的侧链时，位点排序明显变化：",
        "",
        "```text",
        acyclic_diversity[
            [
                "cleavage_position",
                "total_fragments",
                "unique_fragments",
                "normalized_shannon_entropy",
                "mean_pairwise_tanimoto_distance",
                "mean_atom_count",
            ]
        ]
        .sort_values("total_fragments", ascending=False)
        .head(8)
        .to_string(index=False),
        "```",
        "",
        "- 在这个更严格的口径下，`15` 位成为最主要的非环状侧链位点，随后是 `9、12、3` 位。",
        "- 因此，`13、3、4、12` 的高天然多样性是真实存在的，但它主要表征的是**带环单锚点天然侧链**的富集，而不是桥连片段泄漏。",
        "",
        "## 论文可直接引用的中文讨论段落",
        "",
        paper_ready_paragraph,
        "",
        "## 建议的论文表述方式",
        "",
        "- 若讨论天然产物中的侧链多样性，可写为：`16 元大环内酯的天然侧链多样性主要集中在 13、3/4 和 12 位，并在 6 位保留较强的设计相关多样性。`",
        "- 若讨论药化半合成改造热点，可写为：`6、7、15、16 位代表文献和先导化合物研究中优先使用的衍生化位点，其中 6 和 15 位在数据库统计中分别对应高多样性和高频率信号，而 7 和 16 位更多体现为文献指导的探索性位点。`",
        "- 若专门讨论非环状侧链设计，则应强调：`在排除 <=3 重原子小片段并进一步排除带环侧链后，15 位是最主要的非环状侧链修饰位点。`",
        "",
        "## 相关图表",
        "",
        "- 全库原子数分布：`fragment_atom_count_distribution.png`",
        f"- {ring_size} 元环过滤前后位点数量对比：`ring{ring_size}_position_count_comparison.png`",
        f"- {ring_size} 元环大侧链 boxplot：`ring{ring_size}_position_atom_count_boxplot_gt3.png`",
        f"- {ring_size} 元环位点多样性图：`ring{ring_size}_position_diversity_gt3.png`",
        f"- {ring_size} 元环桥环/带环侧链敏感性图：`ring{ring_size}_position_ring_sensitivity.png`",
        f"- {ring_size} 元环药化热点对比图：`ring{ring_size}_medchem_hotspot_comparison.png`",
        "",
    ]
    return "\n".join(lines)


def build_summary_lines(
    dataframe: pd.DataFrame,
    ring_dataframe: pd.DataFrame,
    filtered_ring_dataframe: pd.DataFrame,
    filter_candidates: pd.DataFrame,
    diversity_table: pd.DataFrame,
    diversity_gt3: pd.DataFrame,
    hotspot_table: pd.DataFrame,
    ring_size: int,
    design_min_atoms: int,
) -> list[str]:
    atom_counts = dataframe["fragment_atom_count"]
    robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20] if not diversity_gt3.empty else diversity_gt3
    lines = [
        f"Analyzed rows: {len(dataframe)}",
        f"Unique parent molecules: {dataframe['source_parent_ml_id'].nunique()}",
        f"Unique fragment smiles: {dataframe['fragment_smiles_plain'].nunique()}",
        f"Fragment atom count percentiles: p05={atom_counts.quantile(0.05):.1f}, p25={atom_counts.quantile(0.25):.1f}, p50={atom_counts.quantile(0.5):.1f}, p75={atom_counts.quantile(0.75):.1f}, p95={atom_counts.quantile(0.95):.1f}",
        "",
        "Filter candidates (drop fragments with atom_count <= threshold):",
    ]

    for candidate in filter_candidates.head(5).itertuples(index=False):
        lines.append(
            "  "
            f"<= {candidate.drop_if_atom_count_lte}: "
            f"remove {candidate.removed_rows} rows ({candidate.removed_row_fraction:.1%}), "
            f"remove {candidate.removed_unique_fragments} unique fragments ({candidate.removed_unique_fraction:.1%})"
        )

    lines.extend(
        [
            "",
            f"Ring {ring_size} rows: {len(ring_dataframe)}",
            f"Ring {ring_size} unique fragment smiles: {ring_dataframe['fragment_smiles_plain'].nunique()}",
            f"Ring {ring_size} rows with >= {design_min_atoms} heavy atoms: {len(filtered_ring_dataframe)}",
            f"Ring {ring_size} unique fragment smiles with >= {design_min_atoms} heavy atoms: {filtered_ring_dataframe['fragment_smiles_plain'].nunique()}",
            "",
            f"Ring {ring_size} top positions by normalized Shannon entropy:",
        ]
    )

    for row in diversity_table.sort_values("normalized_shannon_entropy", ascending=False).head(5).itertuples(index=False):
        lines.append(
            "  "
            f"Position {row.cleavage_position}: entropy={row.normalized_shannon_entropy:.3f}, "
            f"unique={row.unique_fragments}, mean_atom_count={row.mean_atom_count:.2f}"
        )

    lines.extend(["", f"Ring {ring_size} top positions by mean pairwise Tanimoto distance:"])
    for row in diversity_table.sort_values("mean_pairwise_tanimoto_distance", ascending=False).head(5).itertuples(index=False):
        lines.append(
            "  "
            f"Position {row.cleavage_position}: distance={row.mean_pairwise_tanimoto_distance:.3f}, "
            f"entropy={row.normalized_shannon_entropy:.3f}, atom_count_range={row.atom_count_range}"
        )

    if not robust_gt3.empty:
        lines.extend(["", f"Ring {ring_size} top filtered positions by normalized Shannon entropy:"])
        for row in robust_gt3.sort_values("normalized_shannon_entropy", ascending=False).head(5).itertuples(index=False):
            lines.append(
                "  "
                f"Position {row.cleavage_position}: entropy={row.normalized_shannon_entropy:.3f}, "
                f"unique={row.unique_fragments}, total={row.total_fragments}, mean_atom_count={row.mean_atom_count:.2f}"
            )

    lines.extend(["", "Medicinal-chemistry hotspot comparison:"])
    for row in hotspot_table.itertuples(index=False):
        lines.append(
            "  "
            f"Position {row.cleavage_position}: all={row.total_fragments_all}, "
            f">={design_min_atoms} atoms={row.total_fragments_gt3}, "
            f"unique_filtered={row.unique_fragments_gt3}, "
            f"entropy_filtered={row.normalized_shannon_entropy_gt3:.3f}"
        )

    lines.extend(
        [
            "",
            "Interpretation note: atom-count spread is only a coarse proxy for diversity.",
            "Use entropy and fingerprint distance as primary diversity evidence; use atom-count spread as supporting context.",
            "For cyclic-side-chain sensitivity, see ring_sensitivity output and the markdown report.",
        ]
    )
    return lines


def main(argv: list[str] | None = None) -> None:
    args = build_parser().parse_args(argv)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    medchem_positions = [int(part.strip()) for part in args.medchem_positions.split(",") if part.strip()]

    sns.set_theme(style="whitegrid")

    dataframe = load_fragment_library_dataset(args.input, args.db)
    analysis_df = dataframe[
        dataframe["splice_ready"].fillna(False) & (dataframe["classification"] == "standard_macrolactone")
    ].copy()
    ring_df = analysis_df[analysis_df["ring_size"] == args.ring_size].copy()
    filtered_ring_df = ring_df[ring_df["fragment_atom_count"] >= args.design_min_atoms].copy()

    if analysis_df.empty:
        raise ValueError("No splice-ready standard macrolactone fragments available for analysis.")
    if ring_df.empty:
        raise ValueError(f"No splice-ready fragments found for ring size {args.ring_size}.")

    atom_count_summary = build_fragment_atom_count_summary(analysis_df)
    atom_count_frequency = build_fragment_atom_count_frequency_table(analysis_df)
    filter_candidates = build_filter_candidate_table(analysis_df)
    diversity_table = build_position_diversity_table(ring_df)
    diversity_gt3 = (
        build_position_diversity_table(filtered_ring_df) if not filtered_ring_df.empty else diversity_table.iloc[0:0].copy()
    )
    position_counts = build_position_count_comparison_table(ring_df, filtered_ring_df)
    ring_sensitivity = build_ring_sensitivity_table(filtered_ring_df, position_counts)
    hotspot_table = build_medchem_hotspot_table(medchem_positions, position_counts, diversity_table, diversity_gt3)

    atom_count_summary.to_csv(output_dir / "fragment_atom_count_summary.csv", index=False)
    atom_count_frequency.to_csv(output_dir / "fragment_atom_count_frequency.csv", index=False)
    filter_candidates.to_csv(output_dir / "fragment_atom_count_filter_candidates.csv", index=False)
    diversity_table.to_csv(output_dir / f"ring{args.ring_size}_position_diversity.csv", index=False)
    diversity_gt3.to_csv(output_dir / f"ring{args.ring_size}_position_diversity_gt3.csv", index=False)
    position_counts.to_csv(output_dir / f"ring{args.ring_size}_position_count_comparison.csv", index=False)
    ring_sensitivity.to_csv(output_dir / f"ring{args.ring_size}_position_ring_sensitivity.csv", index=False)
    hotspot_table.to_csv(output_dir / f"ring{args.ring_size}_medchem_hotspot_comparison.csv", index=False)

    plot_fragment_atom_count_distribution(analysis_df, output_dir / "fragment_atom_count_distribution.png")
    plot_ring_position_count_comparison(
        position_counts,
        args.ring_size,
        args.design_min_atoms,
        output_dir / f"ring{args.ring_size}_position_count_comparison.png",
    )
    plot_ring_position_atom_count_distribution(
        ring_df, args.ring_size, output_dir / f"ring{args.ring_size}_position_atom_count_distribution.png"
    )
    plot_ring_position_atom_count_boxplot(
        filtered_ring_df,
        args.ring_size,
        args.design_min_atoms,
        output_dir / f"ring{args.ring_size}_position_atom_count_boxplot_gt3.png",
    )
    plot_ring_position_diversity(
        diversity_table,
        args.ring_size,
        output_dir / f"ring{args.ring_size}_position_diversity.png",
    )
    plot_ring_position_diversity(
        diversity_gt3,
        args.ring_size,
        output_dir / f"ring{args.ring_size}_position_diversity_gt3.png",
        title_suffix=f"Side-Chain Diversity by Position (>= {args.design_min_atoms} Heavy Atoms)",
    )
    plot_ring_sensitivity(
        ring_sensitivity,
        args.ring_size,
        args.design_min_atoms,
        output_dir / f"ring{args.ring_size}_position_ring_sensitivity.png",
    )
    plot_medchem_hotspot_comparison(
        hotspot_table,
        args.ring_size,
        args.design_min_atoms,
        output_dir / f"ring{args.ring_size}_medchem_hotspot_comparison.png",
    )

    summary_lines = build_summary_lines(
        analysis_df,
        ring_df,
        filtered_ring_df,
        filter_candidates,
        diversity_table,
        diversity_gt3,
        hotspot_table,
        args.ring_size,
        args.design_min_atoms,
    )
    (output_dir / "analysis_summary.txt").write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
    report = build_markdown_report(
        output_dir,
        analysis_df,
        ring_df,
        filtered_ring_df,
        filter_candidates,
        diversity_table,
        diversity_gt3,
        position_counts,
        ring_sensitivity,
        hotspot_table,
        args.ring_size,
        args.design_min_atoms,
    )
    (output_dir / "fragment_library_analysis_report.md").write_text(report + "\n", encoding="utf-8")
    report_zh = build_markdown_report_zh(
        analysis_df,
        ring_df,
        filtered_ring_df,
        filter_candidates,
        diversity_gt3,
        position_counts,
        ring_sensitivity,
        hotspot_table,
        args.ring_size,
        args.design_min_atoms,
    )
    (output_dir / "fragment_library_analysis_report_zh.md").write_text(report_zh + "\n", encoding="utf-8")


if __name__ == "__main__":
    main()