feat(validation): archive key result assets

Keep key validation outputs and analysis tables tracked directly, package analysis plot PNGs into a small tar.gz backup, and add analysis scripts plus tests so the stored results remain reproducible without flooding git with large image trees.
2026-03-19 21:34:27 +08:00
20 changed files with 36723 additions and 0 deletions
--- a/pixi.toml
+++ b/pixi.toml
@@ -19,6 +19,7 @@ rdkit = ">=2025.9.1,<2026"
 pandas = ">=2.3.3,<3"
 numpy = ">=2.3.4,<3"
 matplotlib = ">=3.10,<4"
+seaborn = ">=0.13,<0.14"
 sqlmodel = ">=0.0.37,<0.0.38"

 [pypi-dependencies]
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,6 +16,7 @@ dependencies = [
  "matplotlib>=3.10",
  "numpy>=1.26",
  "pandas>=2.2",
+  "seaborn>=0.13",
 ]

 [project.scripts]
--- a/scripts/analyze_fragment_atom_counts.py
+++ b/scripts/analyze_fragment_atom_counts.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""
+分析 fragment_library.csv 中碎片的原子数分布。
+"""
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from rdkit import Chem
+import os
+import sys
+
+
+def count_atoms(smiles):
+    """计算 SMILES 中的原子数（不包括 dummy atom）"""
+    try:
+        mol = Chem.MolFromSmiles(smiles)
+        if mol is None:
+            return None
+        # 计算所有原子，但排除 dummy atom（原子序数为 0）
+        atom_count = sum(1 for atom in mol.GetAtoms() if atom.GetAtomicNum() != 0)
+        return atom_count
+    except:
+        return None
+
+
+def main():
+    # 读取 CSV 文件
+    csv_path = "validation_output/fragment_library.csv"
+    if not os.path.exists(csv_path):
+        print(f"文件不存在: {csv_path}")
+        sys.exit(1)
+
+    df = pd.read_csv(csv_path)
+    print(f"总碎片数: {len(df)}")
+
+    # 计算原子数
+    df["atom_count"] = df["fragment_smiles_plain"].apply(count_atoms)
+
+    # 检查是否有无效的 SMILES
+    invalid_count = df["atom_count"].isna().sum()
+    if invalid_count > 0:
+        print(f"无效 SMILES 数量: {invalid_count}")
+
+    # 过滤掉无效的 SMILES
+    df_valid = df.dropna(subset=["atom_count"])
+    print(f"有效碎片数: {len(df_valid)}")
+
+    # 统计描述
+    print("\n原子数分布统计:")
+    print(df_valid["atom_count"].describe())
+
+    # 绘制分布图
+    plt.figure(figsize=(10, 6))
+    sns.histplot(
+        df_valid["atom_count"],
+        bins=range(0, int(df_valid["atom_count"].max()) + 2),
+        kde=False,
+    )
+    plt.title("Fragment Atom Count Distribution")
+    plt.xlabel("Atom Count")
+    plt.ylabel("Frequency")
+    plt.grid(True, alpha=0.3)
+
+    # 保存图片
+    output_dir = "validation_output"
+    os.makedirs(output_dir, exist_ok=True)
+    plot_path = os.path.join(output_dir, "fragment_atom_count_distribution.png")
+    plt.savefig(plot_path, dpi=300, bbox_inches="tight")
+    print(f"\n分布图已保存至: {plot_path}")
+
+    # 显示图片（如果在交互式环境中）
+    try:
+        plt.show()
+    except:
+        pass
+
+    # 分析不同原子数的碎片数量
+    atom_count_stats = df_valid["atom_count"].value_counts().sort_index()
+    print("\n不同原子数的碎片数量:")
+    for atom_count, count in atom_count_stats.items():
+        print(f"  原子数 {atom_count}: {count} 个碎片")
+
+    # 计算累积百分比
+    total_fragments = len(df_valid)
+    cumulative = 0
+    print("\n累积分布:")
+    for atom_count, count in atom_count_stats.items():
+        cumulative += count
+        percentage = (cumulative / total_fragments) * 100
+        print(f"  原子数 <= {atom_count}: {cumulative} 个碎片 ({percentage:.2f}%)")
+
+    # 建议筛选标准
+    print("\n建议筛选标准:")
+    # 例如，过滤掉原子数小于 3 的碎片
+    min_atom_count = 3
+    filtered_count = len(df_valid[df_valid["atom_count"] >= min_atom_count])
+    filtered_percentage = (filtered_count / total_fragments) * 100
+    print(
+        f"  如果过滤掉原子数 < {min_atom_count} 的碎片，剩余 {filtered_count} 个碎片 ({filtered_percentage:.2f}%)"
+    )
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/analyze_validation_fragment_library.py
+++ b/scripts/analyze_validation_fragment_library.py
@@ -0,0 +1,986 @@
+from __future__ import annotations
+
+import argparse
+from math import ceil
+from pathlib import Path
+
+import matplotlib
+
+matplotlib.use("Agg")
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+from rdkit import Chem
+from rdkit.Chem import rdMolDescriptors
+
+from macro_lactone_toolkit.validation.fragment_library_analysis import (
+    build_filter_candidate_table,
+    build_fragment_atom_count_frequency_table,
+    build_fragment_atom_count_summary,
+    build_position_diversity_table,
+    load_fragment_library_dataset,
+)
+
+
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description="Analyze validation_output fragment_library.csv and generate atom-count and diversity reports."
+    )
+    parser.add_argument("--input", required=True, help="Path to validation_output/fragment_library.csv")
+    parser.add_argument("--db", required=True, help="Path to validation_output/fragments.db")
+    parser.add_argument("--output-dir", required=True, help="Directory to write plots and CSV summaries")
+    parser.add_argument("--ring-size", type=int, default=16, help="Ring size to analyze position-wise diversity for")
+    parser.add_argument(
+        "--design-min-atoms",
+        type=int,
+        default=4,
+        help="Minimum heavy-atom count to retain for design-oriented side-chain analysis",
+    )
+    parser.add_argument(
+        "--medchem-positions",
+        default="6,7,15,16",
+        help="Comma-separated cleavage positions to highlight as literature or medicinal-chemistry hotspots",
+    )
+    return parser
+
+
+def plot_fragment_atom_count_distribution(dataframe: pd.DataFrame, output_path: Path) -> None:
+    figure, axes = plt.subplots(1, 2, figsize=(14, 5))
+
+    sns.histplot(data=dataframe, x="fragment_atom_count", discrete=True, ax=axes[0], color="#3b82f6")
+    axes[0].set_title("Fragment Atom Count Distribution")
+    axes[0].set_xlabel("Fragment atom count")
+    axes[0].set_ylabel("Fragment rows")
+
+    sns.ecdfplot(data=dataframe, x="fragment_atom_count", ax=axes[1], color="#ef4444")
+    axes[1].set_title("Fragment Atom Count ECDF")
+    axes[1].set_xlabel("Fragment atom count")
+    axes[1].set_ylabel("Cumulative fraction")
+
+    figure.tight_layout()
+    figure.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close(figure)
+
+
+def plot_ring_position_atom_count_distribution(dataframe: pd.DataFrame, ring_size: int, output_path: Path) -> None:
+    positions = sorted(dataframe["cleavage_position"].unique().tolist())
+    ncols = 4
+    nrows = ceil(len(positions) / ncols)
+    figure, axes = plt.subplots(nrows, ncols, figsize=(16, 3.8 * nrows), sharex=True, sharey=True)
+    axes_flat = axes.flatten()
+
+    for axis, position in zip(axes_flat, positions, strict=False):
+        subset = dataframe[dataframe["cleavage_position"] == position]
+        sns.histplot(data=subset, x="fragment_atom_count", discrete=True, ax=axis, color="#10b981")
+        axis.set_title(f"Position {position} (n={len(subset)})")
+        axis.set_xlabel("Atom count")
+        axis.set_ylabel("Count")
+
+    for axis in axes_flat[len(positions) :]:
+        axis.axis("off")
+
+    figure.suptitle(f"{ring_size}-Membered Ring: Atom Count Distribution by Cleavage Position", y=1.02)
+    figure.tight_layout()
+    figure.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close(figure)
+
+
+def plot_ring_position_diversity(
+    diversity_table: pd.DataFrame,
+    ring_size: int,
+    output_path: Path,
+    title_suffix: str = "Side-Chain Diversity by Position",
+) -> None:
+    if diversity_table.empty:
+        figure, axis = plt.subplots(figsize=(10, 4))
+        axis.text(0.5, 0.5, "No fragments available after filtering.", ha="center", va="center")
+        axis.set_axis_off()
+        figure.suptitle(f"{ring_size}-Membered Ring: {title_suffix}", y=0.98)
+        figure.tight_layout()
+        figure.savefig(output_path, dpi=300, bbox_inches="tight")
+        plt.close(figure)
+        return
+
+    figure, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=True)
+
+    sns.barplot(data=diversity_table, x="cleavage_position", y="unique_fragments", ax=axes[0], color="#2563eb")
+    axes[0].set_title("Unique Fragment Types by Position")
+    axes[0].set_ylabel("Unique fragments")
+
+    sns.barplot(
+        data=diversity_table,
+        x="cleavage_position",
+        y="normalized_shannon_entropy",
+        ax=axes[1],
+        color="#f59e0b",
+    )
+    axes[1].set_title("Normalized Shannon Entropy by Position")
+    axes[1].set_ylabel("Normalized entropy")
+
+    sns.barplot(
+        data=diversity_table,
+        x="cleavage_position",
+        y="mean_pairwise_tanimoto_distance",
+        ax=axes[2],
+        color="#7c3aed",
+    )
+    axes[2].set_title("Mean Pairwise Tanimoto Distance by Position")
+    axes[2].set_ylabel("Mean distance")
+    axes[2].set_xlabel("Cleavage position")
+
+    figure.suptitle(f"{ring_size}-Membered Ring: {title_suffix}", y=1.01)
+    figure.tight_layout()
+    figure.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close(figure)
+
+
+def build_position_count_comparison_table(all_dataframe: pd.DataFrame, filtered_dataframe: pd.DataFrame) -> pd.DataFrame:
+    all_stats = (
+        all_dataframe.groupby("cleavage_position")
+        .agg(
+            total_fragments_all=("fragment_smiles_plain", "size"),
+            unique_fragments_all=("fragment_smiles_plain", "nunique"),
+            mean_atom_count_all=("fragment_atom_count", "mean"),
+        )
+        .reset_index()
+    )
+    filtered_stats = (
+        filtered_dataframe.groupby("cleavage_position")
+        .agg(
+            total_fragments_gt3=("fragment_smiles_plain", "size"),
+            unique_fragments_gt3=("fragment_smiles_plain", "nunique"),
+            mean_atom_count_gt3=("fragment_atom_count", "mean"),
+        )
+        .reset_index()
+    )
+
+    merged = all_stats.merge(filtered_stats, on="cleavage_position", how="left")
+    integer_columns = [
+        "total_fragments_gt3",
+        "unique_fragments_gt3",
+    ]
+    for column in integer_columns:
+        merged[column] = merged[column].fillna(0).astype(int)
+    merged["mean_atom_count_gt3"] = merged["mean_atom_count_gt3"].fillna(0.0)
+    merged["gt3_row_fraction"] = merged["total_fragments_gt3"] / merged["total_fragments_all"]
+    merged["gt3_unique_fraction"] = merged["unique_fragments_gt3"] / merged["unique_fragments_all"]
+    return merged.sort_values("cleavage_position").reset_index(drop=True)
+
+
+def annotate_fragment_ring_counts(dataframe: pd.DataFrame) -> pd.DataFrame:
+    result = dataframe.copy()
+    if result.empty:
+        result["fragment_ring_count"] = pd.Series(dtype=int)
+        return result
+    result["fragment_ring_count"] = result["fragment_smiles_plain"].apply(
+        lambda smiles: rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(smiles))
+    )
+    return result
+
+
+def build_ring_sensitivity_table(filtered_dataframe: pd.DataFrame, position_counts: pd.DataFrame) -> pd.DataFrame:
+    rows: list[dict[str, float | int]] = []
+    annotated = annotate_fragment_ring_counts(filtered_dataframe)
+
+    for position in position_counts["cleavage_position"]:
+        subset = annotated[annotated["cleavage_position"] == position].copy()
+        if subset.empty:
+            rows.append(
+                {
+                    "cleavage_position": int(position),
+                    "total_fragments_gt3": 0,
+                    "cyclic_fragment_rows": 0,
+                    "acyclic_fragment_rows": 0,
+                    "cyclic_row_fraction": 0.0,
+                    "acyclic_row_fraction": 0.0,
+                    "unique_fragments_gt3": 0,
+                    "cyclic_unique_fragments": 0,
+                    "acyclic_unique_fragments": 0,
+                }
+            )
+            continue
+
+        unique = subset[["fragment_smiles_plain", "fragment_ring_count"]].drop_duplicates()
+        cyclic_rows = int((subset["fragment_ring_count"] > 0).sum())
+        acyclic_rows = int((subset["fragment_ring_count"] == 0).sum())
+        rows.append(
+            {
+                "cleavage_position": int(position),
+                "total_fragments_gt3": int(len(subset)),
+                "cyclic_fragment_rows": cyclic_rows,
+                "acyclic_fragment_rows": acyclic_rows,
+                "cyclic_row_fraction": cyclic_rows / len(subset),
+                "acyclic_row_fraction": acyclic_rows / len(subset),
+                "unique_fragments_gt3": int(subset["fragment_smiles_plain"].nunique()),
+                "cyclic_unique_fragments": int((unique["fragment_ring_count"] > 0).sum()),
+                "acyclic_unique_fragments": int((unique["fragment_ring_count"] == 0).sum()),
+            }
+        )
+
+    return pd.DataFrame(rows).sort_values("cleavage_position").reset_index(drop=True)
+
+
+def build_medchem_hotspot_table(
+    medchem_positions: list[int],
+    position_counts: pd.DataFrame,
+    diversity_all: pd.DataFrame,
+    diversity_gt3: pd.DataFrame,
+) -> pd.DataFrame:
+    frame = pd.DataFrame({"cleavage_position": medchem_positions})
+    merged = frame.merge(position_counts, on="cleavage_position", how="left")
+    merged = merged.merge(
+        diversity_all[
+            [
+                "cleavage_position",
+                "normalized_shannon_entropy",
+                "mean_pairwise_tanimoto_distance",
+            ]
+        ].rename(
+            columns={
+                "normalized_shannon_entropy": "normalized_shannon_entropy_all",
+                "mean_pairwise_tanimoto_distance": "mean_pairwise_tanimoto_distance_all",
+            }
+        ),
+        on="cleavage_position",
+        how="left",
+    )
+    merged = merged.merge(
+        diversity_gt3[
+            [
+                "cleavage_position",
+                "normalized_shannon_entropy",
+                "mean_pairwise_tanimoto_distance",
+            ]
+        ].rename(
+            columns={
+                "normalized_shannon_entropy": "normalized_shannon_entropy_gt3",
+                "mean_pairwise_tanimoto_distance": "mean_pairwise_tanimoto_distance_gt3",
+            }
+        ),
+        on="cleavage_position",
+        how="left",
+    )
+    numeric_fill_zero = [
+        "total_fragments_all",
+        "unique_fragments_all",
+        "mean_atom_count_all",
+        "total_fragments_gt3",
+        "unique_fragments_gt3",
+        "mean_atom_count_gt3",
+        "gt3_row_fraction",
+        "gt3_unique_fraction",
+        "normalized_shannon_entropy_all",
+        "mean_pairwise_tanimoto_distance_all",
+        "normalized_shannon_entropy_gt3",
+        "mean_pairwise_tanimoto_distance_gt3",
+    ]
+    for column in numeric_fill_zero:
+        merged[column] = merged[column].fillna(0.0)
+
+    integer_columns = [
+        "total_fragments_all",
+        "unique_fragments_all",
+        "total_fragments_gt3",
+        "unique_fragments_gt3",
+    ]
+    for column in integer_columns:
+        merged[column] = merged[column].astype(int)
+
+    return merged.sort_values("cleavage_position").reset_index(drop=True)
+
+
+def plot_ring_position_count_comparison(
+    comparison_table: pd.DataFrame,
+    ring_size: int,
+    design_min_atoms: int,
+    output_path: Path,
+) -> None:
+    figure, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
+
+    melted = comparison_table.melt(
+        id_vars="cleavage_position",
+        value_vars=["total_fragments_all", "total_fragments_gt3"],
+        var_name="series",
+        value_name="count",
+    )
+    melted["series"] = melted["series"].map(
+        {
+            "total_fragments_all": "All splice-ready fragments",
+            "total_fragments_gt3": f"Fragments with >={design_min_atoms} heavy atoms",
+        }
+    )
+    sns.barplot(data=melted, x="cleavage_position", y="count", hue="series", ax=axes[0])
+    axes[0].set_title("Fragment Counts by Position")
+    axes[0].set_ylabel("Fragment rows")
+    axes[0].set_xlabel("")
+    axes[0].legend(loc="upper right")
+
+    sns.barplot(data=comparison_table, x="cleavage_position", y="gt3_row_fraction", ax=axes[1], color="#f59e0b")
+    axes[1].set_title(f"Fraction of Position-Specific Fragments with >={design_min_atoms} Heavy Atoms")
+    axes[1].set_ylabel("Fraction")
+    axes[1].set_xlabel("Cleavage position")
+    axes[1].set_ylim(0, 1.0)
+
+    figure.suptitle(f"{ring_size}-Membered Ring: Position Counts Before and After Size Filtering", y=1.01)
+    figure.tight_layout()
+    figure.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close(figure)
+
+
+def plot_ring_position_atom_count_boxplot(
+    filtered_dataframe: pd.DataFrame,
+    ring_size: int,
+    design_min_atoms: int,
+    output_path: Path,
+) -> None:
+    if filtered_dataframe.empty:
+        figure, axis = plt.subplots(figsize=(10, 4))
+        axis.text(0.5, 0.5, "No fragments satisfy the size filter.", ha="center", va="center")
+        axis.set_axis_off()
+        figure.suptitle(
+            f"{ring_size}-Membered Ring: Atom Count Distribution for Fragments with >={design_min_atoms} Heavy Atoms",
+            y=0.98,
+        )
+        figure.tight_layout()
+        figure.savefig(output_path, dpi=300, bbox_inches="tight")
+        plt.close(figure)
+        return
+
+    figure, axis = plt.subplots(figsize=(14, 6))
+    sns.boxplot(data=filtered_dataframe, x="cleavage_position", y="fragment_atom_count", ax=axis, color="#10b981")
+    axis.set_title(f"{ring_size}-Membered Ring: Atom Count Distribution for Fragments with >={design_min_atoms} Heavy Atoms")
+    axis.set_xlabel("Cleavage position")
+    axis.set_ylabel("Fragment atom count")
+    figure.tight_layout()
+    figure.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close(figure)
+
+
+def plot_medchem_hotspot_comparison(
+    hotspot_table: pd.DataFrame,
+    ring_size: int,
+    design_min_atoms: int,
+    output_path: Path,
+) -> None:
+    figure, axes = plt.subplots(2, 2, figsize=(14, 9), sharex=True)
+    positions = hotspot_table["cleavage_position"].astype(str)
+
+    sns.barplot(x=positions, y=hotspot_table["total_fragments_all"], ax=axes[0, 0], color="#2563eb")
+    axes[0, 0].set_title("All splice-ready fragments")
+    axes[0, 0].set_ylabel("Rows")
+    axes[0, 0].set_xlabel("")
+
+    sns.barplot(x=positions, y=hotspot_table["total_fragments_gt3"], ax=axes[0, 1], color="#10b981")
+    axes[0, 1].set_title(f"Fragments with >={design_min_atoms} heavy atoms")
+    axes[0, 1].set_ylabel("Rows")
+    axes[0, 1].set_xlabel("")
+
+    sns.barplot(x=positions, y=hotspot_table["unique_fragments_gt3"], ax=axes[1, 0], color="#f59e0b")
+    axes[1, 0].set_title(f"Unique fragments with >={design_min_atoms} heavy atoms")
+    axes[1, 0].set_ylabel("Unique SMILES")
+    axes[1, 0].set_xlabel("Cleavage position")
+
+    sns.barplot(x=positions, y=hotspot_table["normalized_shannon_entropy_gt3"], ax=axes[1, 1], color="#7c3aed")
+    axes[1, 1].set_title("Normalized Shannon entropy after size filtering")
+    axes[1, 1].set_ylabel("Entropy")
+    axes[1, 1].set_xlabel("Cleavage position")
+    axes[1, 1].set_ylim(0, 1.0)
+
+    figure.suptitle(f"{ring_size}-Membered Ring: Medicinal-Chemistry Hotspot Comparison", y=1.01)
+    figure.tight_layout()
+    figure.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close(figure)
+
+
+def plot_ring_sensitivity(
+    sensitivity_table: pd.DataFrame,
+    ring_size: int,
+    design_min_atoms: int,
+    output_path: Path,
+) -> None:
+    figure, axes = plt.subplots(2, 1, figsize=(14, 10), sharex=True)
+
+    melted = sensitivity_table.melt(
+        id_vars="cleavage_position",
+        value_vars=["cyclic_fragment_rows", "acyclic_fragment_rows"],
+        var_name="series",
+        value_name="count",
+    )
+    melted["series"] = melted["series"].map(
+        {
+            "cyclic_fragment_rows": "Ring-bearing side chains",
+            "acyclic_fragment_rows": "Acyclic side chains",
+        }
+    )
+    sns.barplot(data=melted, x="cleavage_position", y="count", hue="series", ax=axes[0])
+    axes[0].set_title(f"Fragments with >={design_min_atoms} Heavy Atoms: Cyclic vs Acyclic Side Chains")
+    axes[0].set_ylabel("Fragment rows")
+    axes[0].set_xlabel("")
+    axes[0].legend(loc="upper right")
+
+    sns.barplot(data=sensitivity_table, x="cleavage_position", y="acyclic_row_fraction", ax=axes[1], color="#f59e0b")
+    axes[1].set_title("Acyclic Fraction After Size Filtering")
+    axes[1].set_ylabel("Acyclic fraction")
+    axes[1].set_xlabel("Cleavage position")
+    axes[1].set_ylim(0, 1.0)
+
+    figure.suptitle(f"{ring_size}-Membered Ring: Sensitivity to Ring-Bearing Side Chains", y=1.01)
+    figure.tight_layout()
+    figure.savefig(output_path, dpi=300, bbox_inches="tight")
+    plt.close(figure)
+
+
+def format_top_positions(table: pd.DataFrame, sort_column: str, limit: int = 5) -> str:
+    if table.empty:
+        return "No positions available."
+    subset = table.sort_values(sort_column, ascending=False).head(limit)
+    return subset.to_string(index=False)
+
+
+def build_markdown_report(
+    output_dir: Path,
+    analysis_df: pd.DataFrame,
+    ring_df: pd.DataFrame,
+    filtered_ring_df: pd.DataFrame,
+    filter_candidates: pd.DataFrame,
+    diversity_all: pd.DataFrame,
+    diversity_gt3: pd.DataFrame,
+    position_counts: pd.DataFrame,
+    ring_sensitivity_table: pd.DataFrame,
+    hotspot_table: pd.DataFrame,
+    ring_size: int,
+    design_min_atoms: int,
+) -> str:
+    atom_counts = analysis_df["fragment_atom_count"]
+    annotated_filtered = annotate_fragment_ring_counts(filtered_ring_df)
+    acyclic_gt3 = annotated_filtered[annotated_filtered["fragment_ring_count"] == 0].copy()
+    acyclic_diversity = build_position_diversity_table(acyclic_gt3) if not acyclic_gt3.empty else diversity_gt3.iloc[0:0].copy()
+    conservative_filter = filter_candidates.loc[
+        filter_candidates["drop_if_atom_count_lte"] == max(design_min_atoms - 2, 1)
+    ].iloc[0]
+    design_filter = filter_candidates.loc[
+        filter_candidates["drop_if_atom_count_lte"] == design_min_atoms - 1
+    ].iloc[0]
+    robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20]
+    top_filtered_counts = position_counts[
+        ["cleavage_position", "total_fragments_gt3", "gt3_row_fraction"]
+    ].sort_values("total_fragments_gt3", ascending=False).head(6)
+
+    lines = [
+        "# Fragment Library Analysis Report",
+        "",
+        "## Scope and Dataset",
+        "",
+        f"- Input fragment library: `{analysis_df['source_type'].mode().iloc[0]}` rows merged with `parent_molecules` metadata.",
+        f"- Current validated design library contains **{len(analysis_df):,}** splice-ready fragment rows from **{analysis_df['source_parent_ml_id'].nunique():,}** parent macrolactones.",
+        f"- The {ring_size}-membered subset contains **{len(ring_df):,}** fragment rows from **{ring_df['source_parent_ml_id'].nunique():,}** parent molecules.",
+        f"- This dataset is narrower than the earlier broad workflow summary because it only keeps **splice-ready, single-anchor fragments** from the validated library. It should not be compared to prior total-fragment counts as if they were the same denominator.",
+        "",
+        "## Figure 1. Global Atom-Count Distribution",
+        "",
+        "![Fragment atom count distribution](fragment_atom_count_distribution.png)",
+        "",
+        f"- Heavy-atom count is extremely right-skewed: p05={atom_counts.quantile(0.05):.1f}, p25={atom_counts.quantile(0.25):.1f}, median={atom_counts.quantile(0.50):.1f}, p75={atom_counts.quantile(0.75):.1f}, p95={atom_counts.quantile(0.95):.1f}.",
+        f"- A conservative cleanup filter of `<= {design_min_atoms - 2}` heavy atoms removes **{int(conservative_filter.removed_rows):,}** rows ({conservative_filter.removed_row_fraction:.1%}) but only **{int(conservative_filter.removed_unique_fragments):,}** unique fragment SMILES ({conservative_filter.removed_unique_fraction:.1%}).",
+        f"- A design-oriented filter aligned with your previous analysis, `<= {design_min_atoms - 1}` heavy atoms, removes **{int(design_filter.removed_rows):,}** rows ({design_filter.removed_row_fraction:.1%}) and **{int(design_filter.removed_unique_fragments):,}** unique fragment SMILES ({design_filter.removed_unique_fraction:.1%}).",
+        f"- Interpretation: `<= {design_min_atoms - 2}` is the safer default for library cleanup, while `> {design_min_atoms - 1}` is useful for positional diversity analysis because it suppresses one- and two-atom noise.",
+        "",
+        "## Figure 2. Ring-Specific Counts Before and After Size Filtering",
+        "",
+        f"![Ring {ring_size} position count comparison](ring{ring_size}_position_count_comparison.png)",
+        "",
+        f"- This figure compares all splice-ready fragments with the design-oriented subset that keeps fragments with **>= {design_min_atoms} heavy atoms**.",
+        f"- After size filtering, the most populated {ring_size}-membered positions are:",
+        "",
+        "```text",
+        top_filtered_counts.to_string(index=False),
+        "```",
+        "",
+        "## Figure 3. Atom-Count Spread for Design-Relevant Fragments",
+        "",
+        f"![Ring {ring_size} atom-count boxplot](ring{ring_size}_position_atom_count_boxplot_gt3.png)",
+        "",
+        f"- This boxplot only includes fragments with **>= {design_min_atoms} heavy atoms**.",
+        f"- Positions 13, 3, 4, 11 and 6 carry the broadest large-fragment size envelopes; position 15 remains common but its size spread is narrower, indicating repeated reuse of a small set of acyl-like substituents.",
+        "",
+        "## Figure 4. Position-Wise Diversity Metrics After Filtering",
+        "",
+        f"![Ring {ring_size} filtered diversity](ring{ring_size}_position_diversity_gt3.png)",
+        "",
+        "- Diversity is evaluated with three complementary metrics:",
+        "  1. `unique_fragments`: raw chemotype count.",
+        "  2. `normalized_shannon_entropy`: how evenly those chemotypes are distributed.",
+        "  3. `mean_pairwise_tanimoto_distance`: structural spread in Morgan fingerprint space.",
+        "",
+        "Top robust positions after removing <=3-heavy-atom fragments (`total_fragments >= 20`):",
+        "",
+        "```text",
+        format_top_positions(
+            robust_gt3[
+                [
+                    "cleavage_position",
+                    "total_fragments",
+                    "unique_fragments",
+                    "normalized_shannon_entropy",
+                    "mean_pairwise_tanimoto_distance",
+                    "mean_atom_count",
+                ]
+            ],
+            "normalized_shannon_entropy",
+            limit=8,
+        ),
+        "```",
+        "",
+        f"- Within the filtered {ring_size}-membered set, **position 6** is the strongest support for your medicinal-chemistry hypothesis: it has moderate abundance (**{int(hotspot_table.loc[hotspot_table['cleavage_position'] == 6, 'total_fragments_gt3'].iloc[0])}** rows) but high chemotype diversity and near-maximal entropy.",
+        f"- **Position 15** is also clearly relevant because it retains **{int(hotspot_table.loc[hotspot_table['cleavage_position'] == 15, 'total_fragments_gt3'].iloc[0])}** filtered rows, but its diversity is narrow; the site is frequent, yet dominated by a few acyl motifs.",
+        "",
+        "## Figure 5. Focus on Medicinal-Chemistry Hotspots",
+        "",
+        f"![Ring {ring_size} medchem hotspot comparison](ring{ring_size}_medchem_hotspot_comparison.png)",
+        "",
+        "This panel focuses on positions 6, 7, 15 and 16 because these are the literature-guided derivatization positions from tylosin / tilmicosin / tildipirosin / tulathromycin-like scaffold analysis.",
+        "",
+        "```text",
+        hotspot_table[
+            [
+                "cleavage_position",
+                "total_fragments_all",
+                "total_fragments_gt3",
+                "unique_fragments_gt3",
+                "normalized_shannon_entropy_gt3",
+                "mean_pairwise_tanimoto_distance_gt3",
+            ]
+        ].to_string(index=False),
+        "```",
+        "",
+        "- Position 6: supported by the current database as a **design-relevant and structurally diverse** site.",
+        "- Position 7: not supported as a prevalent natural hotspot in the current database; it appears only a few times after filtering, so it should be described as a **literature- or scaffold-guided modification site**, not a database-enriched site.",
+        "- Position 15: supported as a **frequent modification site**, but the retained chemotypes are concentrated into a small number of acyl substituents.",
+        "- Position 16: not prevalent in the current database, but the few retained fragments are structurally distinct singletons; this makes it a **low-evidence exploratory site**, not a high-confidence natural hotspot.",
+        "",
+        "## Figure 6. Are the Top Positions Driven by Ring-Bearing Side Chains?",
+        "",
+        f"![Ring {ring_size} ring sensitivity](ring{ring_size}_position_ring_sensitivity.png)",
+        "",
+        f"- Multi-anchor bridge or fused components do **not** enter the fragment library because the collector only keeps side-chain components with exactly one ring connection; see [src/macro_lactone_toolkit/_core.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/_core.py#L293) and [src/macro_lactone_toolkit/validation/validator.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/validation/validator.py#L206).",
+        "- What can still happen is that a retained fragment is a **single-anchor but ring-bearing side chain** such as a sugar, heterocycle or other cyclic appendage.",
+        "",
+        "```text",
+        ring_sensitivity_table[
+            [
+                "cleavage_position",
+                "total_fragments_gt3",
+                "cyclic_fragment_rows",
+                "acyclic_fragment_rows",
+                "acyclic_row_fraction",
+                "cyclic_unique_fragments",
+                "acyclic_unique_fragments",
+            ]
+        ].to_string(index=False),
+        "```",
+        "",
+        "- This shows that positions 13, 4, 3 and 6 are indeed dominated by **ring-bearing single-anchor side chains**, not by leaked bridge fragments.",
+        "- Position 15 behaves differently: almost all retained `>3` fragments are **acyclic** acyl-like substituents.",
+        "- Therefore, if your scientific question is specifically about non-cyclic side-chain diversification, the ranking should be recalculated on an acyclic-only subset.",
+        "",
+        "Acyclic-only ranking after removing <=3-heavy-atom fragments:",
+        "",
+        "```text",
+        format_top_positions(
+            acyclic_diversity[
+                [
+                    "cleavage_position",
+                    "total_fragments",
+                    "unique_fragments",
+                    "normalized_shannon_entropy",
+                    "mean_pairwise_tanimoto_distance",
+                    "mean_atom_count",
+                ]
+            ].sort_values("total_fragments", ascending=False),
+            "total_fragments",
+            limit=8,
+        ),
+        "```",
+        "",
+        "- Under this stricter acyclic-only view, position 15 becomes the dominant site, followed by 9, 12 and 3. Positions 13 and 4 no longer dominate, confirming that their earlier prominence is mainly driven by cyclic side chains.",
+        "",
+        "## Reconciliation With the Previous Conclusion",
+        "",
+        "The earlier statement that `6,7,15,16` are important 16-membered macrolide modification positions is **not invalidated** by the current analysis, but it must be phrased carefully because it answers a different question.",
+        "",
+        "- The **previous conclusion** came from scaffold-centric medicinal chemistry on known 16-membered macrolides and identifies **synthetically exploited modification positions**.",
+        "- The **current MacrolactoneDB analysis** measures **natural side-chain occurrence and diversity** in a validated fragment library.",
+        "- These are related but not identical concepts. A site can be medicinally attractive even if it is rare in natural products, and a site can be naturally diverse without being the preferred position for semisynthetic optimization.",
+        "",
+        "### Recommended paper-safe wording",
+        "",
+        f"> In the validated MacrolactoneDB fragment library, natural side-chain diversity of {ring_size}-membered macrolactones is concentrated primarily at positions 13, 3/4 and 12. After excluding fragments with <=3 heavy atoms to focus on design-relevant substituents, position 6 remains strongly diversity-enriched and position 15 remains frequency-enriched, whereas positions 7 and 16 are sparse and should be interpreted as literature-guided derivatization sites rather than statistically dominant natural hotspots.",
+        "",
+        "### Practical interpretation for fragment-library design",
+        "",
+        f"- Use `<= {design_min_atoms - 2}` heavy atoms as the **default cleanup filter** for the production fragment library.",
+        f"- Use the stricter `> {design_min_atoms - 1}` heavy-atom subset when discussing **position-wise diversity** or aligning with the previous exploratory analysis.",
+        f"- For 16-membered macrolide design, prioritize positions **13, 3, 4, 12 and 6** for natural-diversity-driven fragment mining.",
+        "- Keep positions **15** as a targeted acyl-modification site even though its chemotype diversity is narrower.",
+        "- Treat positions **7 and 16** as hypothesis-driven medicinal chemistry positions that need literature or synthesis justification beyond database prevalence.",
+        "",
+    ]
+    return "\n".join(lines)
+
+
+def build_markdown_report_zh(
+    analysis_df: pd.DataFrame,
+    ring_df: pd.DataFrame,
+    filtered_ring_df: pd.DataFrame,
+    filter_candidates: pd.DataFrame,
+    diversity_gt3: pd.DataFrame,
+    position_counts: pd.DataFrame,
+    ring_sensitivity_table: pd.DataFrame,
+    hotspot_table: pd.DataFrame,
+    ring_size: int,
+    design_min_atoms: int,
+) -> str:
+    conservative_filter = filter_candidates.loc[
+        filter_candidates["drop_if_atom_count_lte"] == max(design_min_atoms - 2, 1)
+    ].iloc[0]
+    design_filter = filter_candidates.loc[
+        filter_candidates["drop_if_atom_count_lte"] == design_min_atoms - 1
+    ].iloc[0]
+    robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20]
+    annotated_filtered = annotate_fragment_ring_counts(filtered_ring_df)
+    acyclic_gt3 = annotated_filtered[annotated_filtered["fragment_ring_count"] == 0].copy()
+    acyclic_diversity = build_position_diversity_table(acyclic_gt3) if not acyclic_gt3.empty else diversity_gt3.iloc[0:0].copy()
+
+    paper_ready_paragraph = (
+        f"在当前验证后的 MacrolactoneDB 片段库中，桥连或双锚点侧链不会进入当前片段库，因此 16 元大环内酯位点排序的变化并不是由桥环片段误纳入所致。"
+        f"本研究的断裂规则仅保留与主环具有单一连接点的可拼接侧链；然而，许多被保留的大侧链本身仍可包含糖环、杂环或稠合环等 cyclic single-anchor side chains（带环单锚点侧链），"
+        f"这会显著抬高 13、3、4、12 以及 6 位的天然侧链多样性统计。相反，当仅保留 >3 个重原子且进一步限定为非环状侧链后，位点排序转而更偏向 15、9、12 和 3 位，"
+        f"说明 13、3、4、12 的优势主要反映了天然带环侧链的富集，而 15 位则更接近药物化学中常见的非环状酰基修饰位点。"
+        f"因此，在论文讨论中应明确区分“天然带环侧链多样性热点”和“非环状半合成修饰热点”这两类位点概念。"
+    )
+
+    lines = [
+        "# 大环内酯碎片库分析报告（中文）",
+        "",
+        "## 数据范围",
+        "",
+        f"- 当前验证后的可拼接碎片库包含 **{len(analysis_df):,}** 条片段记录，来源于 **{analysis_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
+        f"- 其中 {ring_size} 元环子集包含 **{len(ring_df):,}** 条片段记录，来源于 **{ring_df['source_parent_ml_id'].nunique():,}** 个母体分子。",
+        f"- 用于设计相关位点分析的严格子集定义为：片段重原子数 **>= {design_min_atoms}**。",
+        "",
+        "## 全库碎片大小结论",
+        "",
+        f"- 默认清洗阈值建议使用 `<= {design_min_atoms - 2}` 重原子删除。该阈值会删除 **{int(conservative_filter.removed_rows):,}** 条记录（{conservative_filter.removed_row_fraction:.1%}），但仅删除 **{int(conservative_filter.removed_unique_fragments):,}** 个唯一片段（{conservative_filter.removed_unique_fraction:.1%}）。",
+        f"- 若用于和你之前的分析口径对齐，则建议采用 `> {design_min_atoms - 1}` 重原子作为设计相关片段集合。此时会删除 **{int(design_filter.removed_rows):,}** 条记录（{design_filter.removed_row_fraction:.1%}）。",
+        "",
+        "## 16 元环位点结论",
+        "",
+        f"- 在 {ring_size} 元环中，保留所有可拼接片段时，天然侧链多样性较高的位置主要集中在 `13、3/4、12`，并且 6 位也显示出较强的设计相关多样性。",
+        f"- 在 `> {design_min_atoms - 1}` 重原子子集中，样本量足够且多样性较高的位点为：",
+        "",
+        "```text",
+        robust_gt3[
+            [
+                "cleavage_position",
+                "total_fragments",
+                "unique_fragments",
+                "normalized_shannon_entropy",
+                "mean_pairwise_tanimoto_distance",
+                "mean_atom_count",
+            ]
+        ]
+        .sort_values("normalized_shannon_entropy", ascending=False)
+        .head(8)
+        .to_string(index=False),
+        "```",
+        "",
+        "- 其中 6 位最能支持你的药化判断：它不是最高频位点，但在设计相关大侧链中显示出很高的结构多样性。",
+        "- 15 位则更偏向高频但低多样性的酰基修饰位点。",
+        "",
+        "## 桥环 / 稠环干扰的敏感性分析",
+        "",
+        "桥连或双锚点侧链不会进入当前片段库，因为断裂逻辑只保留与主环存在 **1 个连接点** 的侧链组件。也就是说，真正的 bridge / fused multi-anchor components 已被代码层面排除。",
+        "",
+        "但是，需要额外区分另一类情况：**cyclic single-anchor side chains**。这类片段虽然只在一个位置连到主环，因此会被保留下来，但片段自身可能包含糖环、杂环或其他环状骨架，仍然会显著影响位点多样性排名。",
+        "",
+        "敏感性分析结果如下：",
+        "",
+        "```text",
+        ring_sensitivity_table[
+            [
+                "cleavage_position",
+                "total_fragments_gt3",
+                "cyclic_fragment_rows",
+                "acyclic_fragment_rows",
+                "acyclic_row_fraction",
+                "cyclic_unique_fragments",
+                "acyclic_unique_fragments",
+            ]
+        ].to_string(index=False),
+        "```",
+        "",
+        "- `13` 位：`689/709` 条大侧链片段本身带环，说明该位点的高多样性主要由天然糖基/环状侧链驱动。",
+        "- `4` 位：`263/269` 条大侧链片段带环，几乎同样完全由环状侧链主导。",
+        "- `3` 位：`231/269` 条大侧链片段带环，带环片段占主导。",
+        "- `12` 位：环状与非环状侧链并存，但环状侧链仍占多数。",
+        "- `6` 位：多数保留的大侧链也带环，但仍表现出较强的多样性。",
+        "- `15` 位：几乎全部是非环状侧链，因此更接近半合成药化修饰位点的特征。",
+        "",
+        "## 仅看非环状侧链时的位点排序",
+        "",
+        f"- 当只保留 `> {design_min_atoms - 1}` 重原子且 **不含环** 的侧链时，位点排序明显变化：",
+        "",
+        "```text",
+        acyclic_diversity[
+            [
+                "cleavage_position",
+                "total_fragments",
+                "unique_fragments",
+                "normalized_shannon_entropy",
+                "mean_pairwise_tanimoto_distance",
+                "mean_atom_count",
+            ]
+        ]
+        .sort_values("total_fragments", ascending=False)
+        .head(8)
+        .to_string(index=False),
+        "```",
+        "",
+        "- 在这个更严格的口径下，`15` 位成为最主要的非环状侧链位点，随后是 `9、12、3` 位。",
+        "- 因此，`13、3、4、12` 的高天然多样性是真实存在的，但它主要表征的是**带环单锚点天然侧链**的富集，而不是桥连片段泄漏。",
+        "",
+        "## 论文可直接引用的中文讨论段落",
+        "",
+        paper_ready_paragraph,
+        "",
+        "## 建议的论文表述方式",
+        "",
+        "- 若讨论天然产物中的侧链多样性，可写为：`16 元大环内酯的天然侧链多样性主要集中在 13、3/4 和 12 位，并在 6 位保留较强的设计相关多样性。`",
+        "- 若讨论药化半合成改造热点，可写为：`6、7、15、16 位代表文献和先导化合物研究中优先使用的衍生化位点，其中 6 和 15 位在数据库统计中分别对应高多样性和高频率信号，而 7 和 16 位更多体现为文献指导的探索性位点。`",
+        "- 若专门讨论非环状侧链设计，则应强调：`在排除 <=3 重原子小片段并进一步排除带环侧链后，15 位是最主要的非环状侧链修饰位点。`",
+        "",
+        "## 相关图表",
+        "",
+        "- 全库原子数分布：`fragment_atom_count_distribution.png`",
+        f"- {ring_size} 元环过滤前后位点数量对比：`ring{ring_size}_position_count_comparison.png`",
+        f"- {ring_size} 元环大侧链 boxplot：`ring{ring_size}_position_atom_count_boxplot_gt3.png`",
+        f"- {ring_size} 元环位点多样性图：`ring{ring_size}_position_diversity_gt3.png`",
+        f"- {ring_size} 元环桥环/带环侧链敏感性图：`ring{ring_size}_position_ring_sensitivity.png`",
+        f"- {ring_size} 元环药化热点对比图：`ring{ring_size}_medchem_hotspot_comparison.png`",
+        "",
+    ]
+    return "\n".join(lines)
+
+
+def build_summary_lines(
+    dataframe: pd.DataFrame,
+    ring_dataframe: pd.DataFrame,
+    filtered_ring_dataframe: pd.DataFrame,
+    filter_candidates: pd.DataFrame,
+    diversity_table: pd.DataFrame,
+    diversity_gt3: pd.DataFrame,
+    hotspot_table: pd.DataFrame,
+    ring_size: int,
+    design_min_atoms: int,
+) -> list[str]:
+    atom_counts = dataframe["fragment_atom_count"]
+    robust_gt3 = diversity_gt3[diversity_gt3["total_fragments"] >= 20] if not diversity_gt3.empty else diversity_gt3
+    lines = [
+        f"Analyzed rows: {len(dataframe)}",
+        f"Unique parent molecules: {dataframe['source_parent_ml_id'].nunique()}",
+        f"Unique fragment smiles: {dataframe['fragment_smiles_plain'].nunique()}",
+        f"Fragment atom count percentiles: p05={atom_counts.quantile(0.05):.1f}, p25={atom_counts.quantile(0.25):.1f}, p50={atom_counts.quantile(0.5):.1f}, p75={atom_counts.quantile(0.75):.1f}, p95={atom_counts.quantile(0.95):.1f}",
+        "",
+        "Filter candidates (drop fragments with atom_count <= threshold):",
+    ]
+
+    for candidate in filter_candidates.head(5).itertuples(index=False):
+        lines.append(
+            "  "
+            f"<= {candidate.drop_if_atom_count_lte}: "
+            f"remove {candidate.removed_rows} rows ({candidate.removed_row_fraction:.1%}), "
+            f"remove {candidate.removed_unique_fragments} unique fragments ({candidate.removed_unique_fraction:.1%})"
+        )
+
+    lines.extend(
+        [
+            "",
+            f"Ring {ring_size} rows: {len(ring_dataframe)}",
+            f"Ring {ring_size} unique fragment smiles: {ring_dataframe['fragment_smiles_plain'].nunique()}",
+            f"Ring {ring_size} rows with >= {design_min_atoms} heavy atoms: {len(filtered_ring_dataframe)}",
+            f"Ring {ring_size} unique fragment smiles with >= {design_min_atoms} heavy atoms: {filtered_ring_dataframe['fragment_smiles_plain'].nunique()}",
+            "",
+            f"Ring {ring_size} top positions by normalized Shannon entropy:",
+        ]
+    )
+
+    for row in diversity_table.sort_values("normalized_shannon_entropy", ascending=False).head(5).itertuples(index=False):
+        lines.append(
+            "  "
+            f"Position {row.cleavage_position}: entropy={row.normalized_shannon_entropy:.3f}, "
+            f"unique={row.unique_fragments}, mean_atom_count={row.mean_atom_count:.2f}"
+        )
+
+    lines.extend(["", f"Ring {ring_size} top positions by mean pairwise Tanimoto distance:"])
+    for row in diversity_table.sort_values("mean_pairwise_tanimoto_distance", ascending=False).head(5).itertuples(index=False):
+        lines.append(
+            "  "
+            f"Position {row.cleavage_position}: distance={row.mean_pairwise_tanimoto_distance:.3f}, "
+            f"entropy={row.normalized_shannon_entropy:.3f}, atom_count_range={row.atom_count_range}"
+        )
+
+    if not robust_gt3.empty:
+        lines.extend(["", f"Ring {ring_size} top filtered positions by normalized Shannon entropy:"])
+        for row in robust_gt3.sort_values("normalized_shannon_entropy", ascending=False).head(5).itertuples(index=False):
+            lines.append(
+                "  "
+                f"Position {row.cleavage_position}: entropy={row.normalized_shannon_entropy:.3f}, "
+                f"unique={row.unique_fragments}, total={row.total_fragments}, mean_atom_count={row.mean_atom_count:.2f}"
+            )
+
+    lines.extend(["", "Medicinal-chemistry hotspot comparison:"])
+    for row in hotspot_table.itertuples(index=False):
+        lines.append(
+            "  "
+            f"Position {row.cleavage_position}: all={row.total_fragments_all}, "
+            f">={design_min_atoms} atoms={row.total_fragments_gt3}, "
+            f"unique_filtered={row.unique_fragments_gt3}, "
+            f"entropy_filtered={row.normalized_shannon_entropy_gt3:.3f}"
+        )
+
+    lines.extend(
+        [
+            "",
+            "Interpretation note: atom-count spread is only a coarse proxy for diversity.",
+            "Use entropy and fingerprint distance as primary diversity evidence; use atom-count spread as supporting context.",
+            "For cyclic-side-chain sensitivity, see ring_sensitivity output and the markdown report.",
+        ]
+    )
+    return lines
+
+
+def main(argv: list[str] | None = None) -> None:
+    args = build_parser().parse_args(argv)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    medchem_positions = [int(part.strip()) for part in args.medchem_positions.split(",") if part.strip()]
+
+    sns.set_theme(style="whitegrid")
+
+    dataframe = load_fragment_library_dataset(args.input, args.db)
+    analysis_df = dataframe[
+        dataframe["splice_ready"].fillna(False) & (dataframe["classification"] == "standard_macrolactone")
+    ].copy()
+    ring_df = analysis_df[analysis_df["ring_size"] == args.ring_size].copy()
+    filtered_ring_df = ring_df[ring_df["fragment_atom_count"] >= args.design_min_atoms].copy()
+
+    if analysis_df.empty:
+        raise ValueError("No splice-ready standard macrolactone fragments available for analysis.")
+    if ring_df.empty:
+        raise ValueError(f"No splice-ready fragments found for ring size {args.ring_size}.")
+
+    atom_count_summary = build_fragment_atom_count_summary(analysis_df)
+    atom_count_frequency = build_fragment_atom_count_frequency_table(analysis_df)
+    filter_candidates = build_filter_candidate_table(analysis_df)
+    diversity_table = build_position_diversity_table(ring_df)
+    diversity_gt3 = (
+        build_position_diversity_table(filtered_ring_df) if not filtered_ring_df.empty else diversity_table.iloc[0:0].copy()
+    )
+    position_counts = build_position_count_comparison_table(ring_df, filtered_ring_df)
+    ring_sensitivity = build_ring_sensitivity_table(filtered_ring_df, position_counts)
+    hotspot_table = build_medchem_hotspot_table(medchem_positions, position_counts, diversity_table, diversity_gt3)
+
+    atom_count_summary.to_csv(output_dir / "fragment_atom_count_summary.csv", index=False)
+    atom_count_frequency.to_csv(output_dir / "fragment_atom_count_frequency.csv", index=False)
+    filter_candidates.to_csv(output_dir / "fragment_atom_count_filter_candidates.csv", index=False)
+    diversity_table.to_csv(output_dir / f"ring{args.ring_size}_position_diversity.csv", index=False)
+    diversity_gt3.to_csv(output_dir / f"ring{args.ring_size}_position_diversity_gt3.csv", index=False)
+    position_counts.to_csv(output_dir / f"ring{args.ring_size}_position_count_comparison.csv", index=False)
+    ring_sensitivity.to_csv(output_dir / f"ring{args.ring_size}_position_ring_sensitivity.csv", index=False)
+    hotspot_table.to_csv(output_dir / f"ring{args.ring_size}_medchem_hotspot_comparison.csv", index=False)
+
+    plot_fragment_atom_count_distribution(analysis_df, output_dir / "fragment_atom_count_distribution.png")
+    plot_ring_position_count_comparison(
+        position_counts,
+        args.ring_size,
+        args.design_min_atoms,
+        output_dir / f"ring{args.ring_size}_position_count_comparison.png",
+    )
+    plot_ring_position_atom_count_distribution(
+        ring_df, args.ring_size, output_dir / f"ring{args.ring_size}_position_atom_count_distribution.png"
+    )
+    plot_ring_position_atom_count_boxplot(
+        filtered_ring_df,
+        args.ring_size,
+        args.design_min_atoms,
+        output_dir / f"ring{args.ring_size}_position_atom_count_boxplot_gt3.png",
+    )
+    plot_ring_position_diversity(
+        diversity_table,
+        args.ring_size,
+        output_dir / f"ring{args.ring_size}_position_diversity.png",
+    )
+    plot_ring_position_diversity(
+        diversity_gt3,
+        args.ring_size,
+        output_dir / f"ring{args.ring_size}_position_diversity_gt3.png",
+        title_suffix=f"Side-Chain Diversity by Position (>= {args.design_min_atoms} Heavy Atoms)",
+    )
+    plot_ring_sensitivity(
+        ring_sensitivity,
+        args.ring_size,
+        args.design_min_atoms,
+        output_dir / f"ring{args.ring_size}_position_ring_sensitivity.png",
+    )
+    plot_medchem_hotspot_comparison(
+        hotspot_table,
+        args.ring_size,
+        args.design_min_atoms,
+        output_dir / f"ring{args.ring_size}_medchem_hotspot_comparison.png",
+    )
+
+    summary_lines = build_summary_lines(
+        analysis_df,
+        ring_df,
+        filtered_ring_df,
+        filter_candidates,
+        diversity_table,
+        diversity_gt3,
+        hotspot_table,
+        args.ring_size,
+        args.design_min_atoms,
+    )
+    (output_dir / "analysis_summary.txt").write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
+    report = build_markdown_report(
+        output_dir,
+        analysis_df,
+        ring_df,
+        filtered_ring_df,
+        filter_candidates,
+        diversity_table,
+        diversity_gt3,
+        position_counts,
+        ring_sensitivity,
+        hotspot_table,
+        args.ring_size,
+        args.design_min_atoms,
+    )
+    (output_dir / "fragment_library_analysis_report.md").write_text(report + "\n", encoding="utf-8")
+    report_zh = build_markdown_report_zh(
+        analysis_df,
+        ring_df,
+        filtered_ring_df,
+        filter_candidates,
+        diversity_gt3,
+        position_counts,
+        ring_sensitivity,
+        hotspot_table,
+        args.ring_size,
+        args.design_min_atoms,
+    )
+    (output_dir / "fragment_library_analysis_report_zh.md").write_text(report_zh + "\n", encoding="utf-8")
+
+
+if __name__ == "__main__":
+    main()
--- a/src/macro_lactone_toolkit/validation/fragment_library_analysis.py
+++ b/src/macro_lactone_toolkit/validation/fragment_library_analysis.py
@@ -0,0 +1,183 @@
+from __future__ import annotations
+
+import math
+import sqlite3
+from functools import lru_cache
+from pathlib import Path
+
+import pandas as pd
+from rdkit import Chem, DataStructs
+from rdkit.Chem import rdFingerprintGenerator
+from rdkit.DataStructs.cDataStructs import ExplicitBitVect
+
+
+_FINGERPRINT_GENERATOR = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
+
+
+@lru_cache(maxsize=None)
+def _mol_from_smiles(fragment_smiles: str) -> Chem.Mol:
+    mol = Chem.MolFromSmiles(fragment_smiles)
+    if mol is None:
+        raise ValueError(f"Invalid fragment SMILES: {fragment_smiles}")
+    return mol
+
+
+@lru_cache(maxsize=None)
+def _fingerprint_from_smiles(fragment_smiles: str) -> ExplicitBitVect:
+    return _FINGERPRINT_GENERATOR.GetFingerprint(_mol_from_smiles(fragment_smiles))
+
+
+def count_non_dummy_atoms(fragment_smiles: str) -> int:
+    return int(_mol_from_smiles(fragment_smiles).GetNumHeavyAtoms())
+
+
+def annotate_fragment_atom_counts(dataframe: pd.DataFrame) -> pd.DataFrame:
+    result = dataframe.copy()
+    atom_count_map = {
+        smiles: count_non_dummy_atoms(smiles)
+        for smiles in result["fragment_smiles_plain"].dropna().unique()
+    }
+    result["fragment_atom_count"] = result["fragment_smiles_plain"].map(atom_count_map).astype(int)
+    return result
+
+
+def load_fragment_library_dataset(input_path: str | Path, db_path: str | Path) -> pd.DataFrame:
+    fragments = pd.read_csv(input_path)
+
+    with sqlite3.connect(db_path) as connection:
+        parents = pd.read_sql_query(
+            """
+            SELECT ml_id, classification, ring_size, processing_status, num_sidechains
+            FROM parent_molecules
+            """,
+            connection,
+        )
+
+    merged = fragments.merge(
+        parents,
+        left_on="source_parent_ml_id",
+        right_on="ml_id",
+        how="left",
+        validate="many_to_one",
+    ).drop(columns=["ml_id"])
+
+    if merged[["classification", "ring_size"]].isna().any().any():
+        missing = merged[merged["ring_size"].isna() | merged["classification"].isna()]
+        sample_ids = sorted(missing["source_parent_ml_id"].dropna().unique().tolist())[:10]
+        raise ValueError(f"Missing parent metadata for fragment rows: {sample_ids}")
+
+    return annotate_fragment_atom_counts(merged)
+
+
+def build_fragment_atom_count_summary(dataframe: pd.DataFrame) -> pd.DataFrame:
+    atom_counts = dataframe["fragment_atom_count"]
+    summary = {
+        "rows": int(len(atom_counts)),
+        "unique_parent_molecules": int(dataframe["source_parent_ml_id"].nunique()),
+        "unique_fragment_smiles": int(dataframe["fragment_smiles_plain"].nunique()),
+        "min_atom_count": int(atom_counts.min()),
+        "p05_atom_count": float(atom_counts.quantile(0.05)),
+        "p25_atom_count": float(atom_counts.quantile(0.25)),
+        "median_atom_count": float(atom_counts.quantile(0.5)),
+        "mean_atom_count": float(atom_counts.mean()),
+        "p75_atom_count": float(atom_counts.quantile(0.75)),
+        "p95_atom_count": float(atom_counts.quantile(0.95)),
+        "max_atom_count": int(atom_counts.max()),
+    }
+    return pd.DataFrame([summary])
+
+
+def build_fragment_atom_count_frequency_table(dataframe: pd.DataFrame) -> pd.DataFrame:
+    row_counts = dataframe["fragment_atom_count"].value_counts().sort_index()
+    unique_fragment_counts = (
+        dataframe.drop_duplicates(subset=["fragment_smiles_plain"])["fragment_atom_count"]
+        .value_counts()
+        .sort_index()
+    )
+    frequency = pd.DataFrame({"fragment_atom_count": sorted(row_counts.index.union(unique_fragment_counts.index))})
+    frequency["row_count"] = frequency["fragment_atom_count"].map(row_counts).fillna(0).astype(int)
+    frequency["unique_fragment_count"] = (
+        frequency["fragment_atom_count"].map(unique_fragment_counts).fillna(0).astype(int)
+    )
+    frequency["row_fraction"] = frequency["row_count"] / max(int(len(dataframe)), 1)
+    frequency["unique_fragment_fraction"] = frequency["unique_fragment_count"] / max(
+        int(dataframe["fragment_smiles_plain"].nunique()), 1
+    )
+    return frequency
+
+
+def build_filter_candidate_table(dataframe: pd.DataFrame, max_threshold: int = 10) -> pd.DataFrame:
+    unique_atom_counts = dataframe.drop_duplicates(subset=["fragment_smiles_plain"])[
+        ["fragment_smiles_plain", "fragment_atom_count"]
+    ]
+    total_rows = int(len(dataframe))
+    total_unique = int(len(unique_atom_counts))
+    thresholds = range(1, min(max_threshold, int(dataframe["fragment_atom_count"].max())) + 1)
+
+    candidates: list[dict[str, float | int]] = []
+    for threshold in thresholds:
+        removed_rows = int((dataframe["fragment_atom_count"] <= threshold).sum())
+        removed_unique = int((unique_atom_counts["fragment_atom_count"] <= threshold).sum())
+        candidates.append(
+            {
+                "drop_if_atom_count_lte": threshold,
+                "removed_rows": removed_rows,
+                "removed_row_fraction": removed_rows / max(total_rows, 1),
+                "retained_rows": total_rows - removed_rows,
+                "retained_row_fraction": (total_rows - removed_rows) / max(total_rows, 1),
+                "removed_unique_fragments": removed_unique,
+                "removed_unique_fraction": removed_unique / max(total_unique, 1),
+                "retained_unique_fragments": total_unique - removed_unique,
+                "retained_unique_fraction": (total_unique - removed_unique) / max(total_unique, 1),
+            }
+        )
+    return pd.DataFrame(candidates)
+
+
+def _shannon_entropy(smiles_series: pd.Series) -> float:
+    probabilities = smiles_series.value_counts(normalize=True)
+    return float(-(probabilities * probabilities.map(math.log)).sum())
+
+
+def _mean_pairwise_tanimoto_distance(smiles_values: pd.Series) -> float:
+    unique_smiles = sorted(smiles_values.dropna().unique().tolist())
+    if len(unique_smiles) < 2:
+        return 0.0
+
+    fingerprints = [_fingerprint_from_smiles(smiles) for smiles in unique_smiles]
+    distances: list[float] = []
+    for index, fingerprint in enumerate(fingerprints[:-1]):
+        similarities = DataStructs.BulkTanimotoSimilarity(fingerprint, fingerprints[index + 1 :])
+        distances.extend(1.0 - similarity for similarity in similarities)
+    return float(sum(distances) / len(distances))
+
+
+def build_position_diversity_table(dataframe: pd.DataFrame) -> pd.DataFrame:
+    working = dataframe if "fragment_atom_count" in dataframe.columns else annotate_fragment_atom_counts(dataframe)
+    rows: list[dict[str, float | int]] = []
+
+    for cleavage_position, group in working.groupby("cleavage_position", sort=True):
+        unique_fragments = int(group["fragment_smiles_plain"].nunique())
+        entropy = _shannon_entropy(group["fragment_smiles_plain"])
+        atom_counts = group["fragment_atom_count"]
+        normalized_entropy = entropy / math.log(unique_fragments) if unique_fragments > 1 else 0.0
+        rows.append(
+            {
+                "cleavage_position": int(cleavage_position),
+                "total_fragments": int(len(group)),
+                "unique_fragments": unique_fragments,
+                "normalized_unique_ratio": unique_fragments / max(int(len(group)), 1),
+                "shannon_entropy": entropy,
+                "normalized_shannon_entropy": normalized_entropy,
+                "mean_pairwise_tanimoto_distance": _mean_pairwise_tanimoto_distance(group["fragment_smiles_plain"]),
+                "mean_atom_count": float(atom_counts.mean()),
+                "median_atom_count": float(atom_counts.median()),
+                "std_atom_count": float(atom_counts.std(ddof=0)),
+                "iqr_atom_count": float(atom_counts.quantile(0.75) - atom_counts.quantile(0.25)),
+                "min_atom_count": int(atom_counts.min()),
+                "max_atom_count": int(atom_counts.max()),
+                "atom_count_range": int(atom_counts.max() - atom_counts.min()),
+            }
+        )
+
+    return pd.DataFrame(rows).sort_values("cleavage_position").reset_index(drop=True)
--- a/tests/test_scripts_and_docs.py
+++ b/tests/test_scripts_and_docs.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import json
+import sqlite3
 import subprocess
 import sys
 from pathlib import Path
@@ -133,6 +134,159 @@ def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path):
    assert (output_dir / "sdf" / "sdf_1_3d.sdf").exists()


+def test_analyze_validation_fragment_library_script_generates_reports(tmp_path):
+    input_path = tmp_path / "fragment_library.csv"
+    db_path = tmp_path / "fragments.db"
+    output_dir = tmp_path / "fragment_library_analysis"
+
+    pd.DataFrame(
+        [
+            {
+                "id": 1,
+                "source_type": "validation_extract",
+                "source_fragment_id": "ML16A_frag_0",
+                "source_parent_ml_id": "ML16A",
+                "source_parent_chembl_id": None,
+                "cleavage_position": 3,
+                "fragment_smiles_labeled": "[3*]C",
+                "fragment_smiles_plain": "*C",
+                "has_dummy_atom": True,
+                "dummy_atom_count": 1,
+                "splice_ready": True,
+                "original_bond_type": "SINGLE",
+                "created_at": "2026-03-19 00:00:00",
+            },
+            {
+                "id": 2,
+                "source_type": "validation_extract",
+                "source_fragment_id": "ML16A_frag_1",
+                "source_parent_ml_id": "ML16A",
+                "source_parent_chembl_id": None,
+                "cleavage_position": 3,
+                "fragment_smiles_labeled": "[3*]CC",
+                "fragment_smiles_plain": "*CC",
+                "has_dummy_atom": True,
+                "dummy_atom_count": 1,
+                "splice_ready": True,
+                "original_bond_type": "SINGLE",
+                "created_at": "2026-03-19 00:00:00",
+            },
+            {
+                "id": 3,
+                "source_type": "validation_extract",
+                "source_fragment_id": "ML16B_frag_0",
+                "source_parent_ml_id": "ML16B",
+                "source_parent_chembl_id": None,
+                "cleavage_position": 4,
+                "fragment_smiles_labeled": "[4*]O",
+                "fragment_smiles_plain": "*O",
+                "has_dummy_atom": True,
+                "dummy_atom_count": 1,
+                "splice_ready": True,
+                "original_bond_type": "SINGLE",
+                "created_at": "2026-03-19 00:00:00",
+            },
+            {
+                "id": 4,
+                "source_type": "validation_extract",
+                "source_fragment_id": "ML14A_frag_0",
+                "source_parent_ml_id": "ML14A",
+                "source_parent_chembl_id": None,
+                "cleavage_position": 3,
+                "fragment_smiles_labeled": "[3*]CCC",
+                "fragment_smiles_plain": "*CCC",
+                "has_dummy_atom": True,
+                "dummy_atom_count": 1,
+                "splice_ready": True,
+                "original_bond_type": "SINGLE",
+                "created_at": "2026-03-19 00:00:00",
+            },
+        ]
+    ).to_csv(input_path, index=False)
+
+    with sqlite3.connect(db_path) as connection:
+        connection.execute(
+            """
+            CREATE TABLE parent_molecules (
+                id INTEGER PRIMARY KEY,
+                ml_id TEXT NOT NULL,
+                chembl_id TEXT,
+                molecule_name TEXT,
+                smiles TEXT NOT NULL,
+                classification TEXT NOT NULL,
+                ring_size INTEGER,
+                primary_reason_code TEXT,
+                primary_reason_message TEXT,
+                processing_status TEXT NOT NULL,
+                error_message TEXT,
+                num_sidechains INTEGER,
+                cleavage_positions TEXT,
+                numbered_image_path TEXT,
+                created_at TEXT NOT NULL,
+                processed_at TEXT
+            )
+            """
+        )
+        connection.executemany(
+            """
+            INSERT INTO parent_molecules (
+                id, ml_id, chembl_id, molecule_name, smiles, classification, ring_size,
+                primary_reason_code, primary_reason_message, processing_status, error_message,
+                num_sidechains, cleavage_positions, numbered_image_path, created_at, processed_at
+            )
+            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+            """,
+            [
+                (1, "ML16A", None, None, "C1CCCCCCCCCCCCCC(=O)O1", "standard_macrolactone", 16, None, None, "success", None, 2, "[3]", None, "2026-03-19 00:00:00", None),
+                (2, "ML16B", None, None, "C1CCCCCCCCCCCCCC(=O)O1", "standard_macrolactone", 16, None, None, "success", None, 1, "[4]", None, "2026-03-19 00:00:00", None),
+                (3, "ML14A", None, None, "C1CCCCCCCCCCCC(=O)O1", "standard_macrolactone", 14, None, None, "success", None, 1, "[3]", None, "2026-03-19 00:00:00", None),
+            ],
+        )
+        connection.commit()
+
+    completed = run_script(
+        "analyze_validation_fragment_library.py",
+        "--input",
+        str(input_path),
+        "--db",
+        str(db_path),
+        "--output-dir",
+        str(output_dir),
+        "--ring-size",
+        "16",
+    )
+
+    assert completed.returncode == 0, completed.stderr
+    assert (output_dir / "fragment_atom_count_distribution.png").exists()
+    assert (output_dir / "fragment_atom_count_summary.csv").exists()
+    assert (output_dir / "fragment_atom_count_filter_candidates.csv").exists()
+    assert (output_dir / "ring16_position_count_comparison.csv").exists()
+    assert (output_dir / "ring16_position_count_comparison.png").exists()
+    assert (output_dir / "ring16_position_atom_count_distribution.png").exists()
+    assert (output_dir / "ring16_position_atom_count_boxplot_gt3.png").exists()
+    assert (output_dir / "ring16_position_diversity.csv").exists()
+    assert (output_dir / "ring16_position_diversity_gt3.csv").exists()
+    assert (output_dir / "ring16_position_diversity_gt3.png").exists()
+    assert (output_dir / "ring16_position_ring_sensitivity.csv").exists()
+    assert (output_dir / "ring16_position_ring_sensitivity.png").exists()
+    assert (output_dir / "ring16_medchem_hotspot_comparison.csv").exists()
+    assert (output_dir / "ring16_medchem_hotspot_comparison.png").exists()
+    assert (output_dir / "fragment_library_analysis_report.md").exists()
+    assert (output_dir / "fragment_library_analysis_report_zh.md").exists()
+    assert (output_dir / "analysis_summary.txt").exists()
+
+    diversity = pd.read_csv(output_dir / "ring16_position_diversity.csv")
+    assert set(diversity["cleavage_position"]) == {3, 4}
+    assert set(diversity["total_fragments"]) == {1, 2}
+
+    diversity_gt3 = pd.read_csv(output_dir / "ring16_position_diversity_gt3.csv")
+    assert diversity_gt3.empty
+
+    report_zh = (output_dir / "fragment_library_analysis_report_zh.md").read_text(encoding="utf-8")
+    assert "桥连或双锚点侧链不会进入当前片段库" in report_zh
+    assert "cyclic single-anchor side chains" in report_zh
+
+
 def test_active_text_assets_do_not_reference_legacy_api():
    forbidden_patterns = [
        "from src.",
--- a/tests/validation/test_fragment_library_analysis.py
+++ b/tests/validation/test_fragment_library_analysis.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import pandas as pd
+import pytest
+
+from macro_lactone_toolkit.validation.fragment_library_analysis import (
+    build_position_diversity_table,
+    count_non_dummy_atoms,
+)
+
+
+def test_count_non_dummy_atoms_excludes_dummy_atoms() -> None:
+    assert count_non_dummy_atoms("*O") == 1
+    assert count_non_dummy_atoms("*C") == 1
+    assert count_non_dummy_atoms("*C(C)C") == 3
+
+
+def test_build_position_diversity_table_combines_frequency_and_structure_metrics() -> None:
+    dataframe = pd.DataFrame(
+        [
+            {"cleavage_position": 3, "fragment_smiles_plain": "*C"},
+            {"cleavage_position": 3, "fragment_smiles_plain": "*CC"},
+            {"cleavage_position": 3, "fragment_smiles_plain": "*CC"},
+            {"cleavage_position": 3, "fragment_smiles_plain": "*O"},
+            {"cleavage_position": 4, "fragment_smiles_plain": "*C"},
+        ]
+    )
+
+    summary = build_position_diversity_table(dataframe).set_index("cleavage_position")
+
+    assert summary.loc[3, "total_fragments"] == 4
+    assert summary.loc[3, "unique_fragments"] == 3
+    assert summary.loc[3, "normalized_unique_ratio"] == pytest.approx(0.75)
+    assert summary.loc[3, "shannon_entropy"] > 0.0
+    assert summary.loc[3, "normalized_shannon_entropy"] > 0.0
+    assert summary.loc[3, "mean_pairwise_tanimoto_distance"] > 0.0
+
+    assert summary.loc[4, "total_fragments"] == 1
+    assert summary.loc[4, "unique_fragments"] == 1
+    assert summary.loc[4, "mean_pairwise_tanimoto_distance"] == 0.0
--- a/validation_output/analysis_plots.tar.gz
+++ b/validation_output/analysis_plots.tar.gz
--- a/validation_output/fragment_library.csv
+++ b/validation_output/fragment_library.csv
--- a/validation_output/fragment_library_analysis/analysis_summary.txt
+++ b/validation_output/fragment_library_analysis/analysis_summary.txt
@@ -0,0 +1,47 @@
+Analyzed rows: 34829
+Unique parent molecules: 4451
+Unique fragment smiles: 1852
+Fragment atom count percentiles: p05=1.0, p25=1.0, p50=1.0, p75=2.0, p95=14.0
+
+Filter candidates (drop fragments with atom_count <= threshold):
+  <= 1: remove 23994 rows (68.9%), remove 10 unique fragments (0.5%)
+  <= 2: remove 28069 rows (80.6%), remove 26 unique fragments (1.4%)
+  <= 3: remove 28550 rows (82.0%), remove 52 unique fragments (2.8%)
+  <= 4: remove 29045 rows (83.4%), remove 88 unique fragments (4.8%)
+  <= 5: remove 29272 rows (84.0%), remove 141 unique fragments (7.6%)
+
+Ring 16 rows: 8108
+Ring 16 unique fragment smiles: 596
+Ring 16 rows with >= 4 heavy atoms: 1880
+Ring 16 unique fragment smiles with >= 4 heavy atoms: 566
+
+Ring 16 top positions by normalized Shannon entropy:
+  Position 7: entropy=0.857, unique=4, mean_atom_count=2.57
+  Position 13: entropy=0.739, unique=198, mean_atom_count=15.50
+  Position 4: entropy=0.584, unique=70, mean_atom_count=6.89
+  Position 12: entropy=0.490, unique=99, mean_atom_count=3.63
+  Position 3: entropy=0.449, unique=121, mean_atom_count=5.10
+
+Ring 16 top positions by mean pairwise Tanimoto distance:
+  Position 16: distance=0.901, entropy=0.415, atom_count_range=12
+  Position 10: distance=0.871, entropy=0.077, atom_count_range=13
+  Position 7: distance=0.860, entropy=0.857, atom_count_range=9
+  Position 14: distance=0.848, entropy=0.375, atom_count_range=13
+  Position 12: distance=0.839, entropy=0.490, atom_count_range=20
+
+Ring 16 top filtered positions by normalized Shannon entropy:
+  Position 6: entropy=0.973, unique=60, total=89, mean_atom_count=12.58
+  Position 12: entropy=0.886, unique=83, total=177, mean_atom_count=10.00
+  Position 3: entropy=0.854, unique=117, total=269, mean_atom_count=15.41
+  Position 13: entropy=0.763, unique=193, total=709, mean_atom_count=18.91
+  Position 9: entropy=0.729, unique=37, total=141, mean_atom_count=7.82
+
+Medicinal-chemistry hotspot comparison:
+  Position 6: all=536, >=4 atoms=89, unique_filtered=60, entropy_filtered=0.973
+  Position 7: all=23, >=4 atoms=4, unique_filtered=1, entropy_filtered=0.000
+  Position 15: all=747, >=4 atoms=205, unique_filtered=8, entropy_filtered=0.456
+  Position 16: all=135, >=4 atoms=5, unique_filtered=5, entropy_filtered=1.000
+
+Interpretation note: atom-count spread is only a coarse proxy for diversity.
+Use entropy and fingerprint distance as primary diversity evidence; use atom-count spread as supporting context.
+For cyclic-side-chain sensitivity, see ring_sensitivity output and the markdown report.
--- a/validation_output/fragment_library_analysis/fragment_atom_count_filter_candidates.csv
+++ b/validation_output/fragment_library_analysis/fragment_atom_count_filter_candidates.csv
@@ -0,0 +1,11 @@
+drop_if_atom_count_lte,removed_rows,removed_row_fraction,retained_rows,retained_row_fraction,removed_unique_fragments,removed_unique_fraction,retained_unique_fragments,retained_unique_fraction
+1,23994,0.6889086680639697,10835,0.31109133193603034,10,0.005399568034557235,1842,0.9946004319654428
+2,28069,0.80590886904591,6760,0.19409113095408997,26,0.014038876889848811,1826,0.9859611231101512
+3,28550,0.8197191995176434,6279,0.18028080048235665,52,0.028077753779697623,1800,0.9719222462203023
+4,29045,0.8339314938700508,5784,0.16606850612994917,88,0.047516198704103674,1764,0.9524838012958964
+5,29272,0.8404490510781245,5557,0.15955094892187544,141,0.07613390928725702,1711,0.923866090712743
+6,29353,0.8427746992448821,5476,0.15722530075511787,188,0.10151187904967603,1664,0.8984881209503239
+7,29446,0.845444887880789,5383,0.154555112119211,248,0.13390928725701945,1604,0.8660907127429806
+8,29603,0.849952625685492,5226,0.15004737431450801,331,0.1787257019438445,1521,0.8212742980561555
+9,29812,0.8559533721898418,5017,0.1440466278101582,411,0.22192224622030238,1441,0.7780777537796977
+10,30146,0.8655430819144966,4683,0.13445691808550345,510,0.275377969762419,1342,0.724622030237581
--- a/validation_output/fragment_library_analysis/fragment_atom_count_frequency.csv
+++ b/validation_output/fragment_library_analysis/fragment_atom_count_frequency.csv
@@ -0,0 +1,46 @@
+fragment_atom_count,row_count,unique_fragment_count,row_fraction,unique_fragment_fraction
+1,23994,10,0.6889086680639697,0.005399568034557235
+2,4075,16,0.11700020098194033,0.008639308855291577
+3,481,26,0.013810330471733325,0.014038876889848811
+4,495,36,0.014212294352407477,0.019438444924406047
+5,227,53,0.006517557208073732,0.028617710583153346
+6,81,47,0.002325648166757587,0.025377969762419007
+7,93,60,0.0026701886359068593,0.032397408207343416
+8,157,83,0.004507737804702977,0.044816414686825054
+9,209,80,0.006000746504349824,0.04319654427645788
+10,334,99,0.009589709724654743,0.05345572354211663
+11,375,142,0.010766889660914755,0.07667386609071274
+12,2002,148,0.057480834936403574,0.07991360691144708
+13,382,100,0.01096787160125183,0.05399568034557235
+14,504,133,0.014470699704269431,0.07181425485961124
+15,230,97,0.006603692325361049,0.052375809935205186
+16,110,79,0.003158287633868328,0.04265658747300216
+17,122,71,0.0035028281030176005,0.03833693304535637
+18,128,81,0.003675098337592236,0.04373650107991361
+19,109,53,0.003129575928105889,0.028617710583153346
+20,30,26,0.0008613511728731804,0.014038876889848811
+21,56,48,0.0016078555226966035,0.02591792656587473
+22,24,21,0.0006890809382985443,0.011339092872570195
+23,137,42,0.0039335036894541904,0.02267818574514039
+24,32,29,0.000918774584398059,0.01565874730021598
+25,26,21,0.000746504349823423,0.011339092872570195
+26,50,36,0.0014355852881219673,0.019438444924406047
+27,69,29,0.001981107697608315,0.01565874730021598
+28,41,23,0.0011771799362600133,0.012419006479481642
+29,84,34,0.002411783284044905,0.0183585313174946
+30,30,21,0.0008613511728731804,0.011339092872570195
+31,18,13,0.0005168107037239083,0.007019438444924406
+32,33,23,0.0009474862901604985,0.012419006479481642
+33,14,10,0.0004019638806741509,0.005399568034557235
+34,11,9,0.0003158287633868328,0.004859611231101512
+35,14,10,0.0004019638806741509,0.005399568034557235
+36,8,7,0.00022969364609951476,0.003779697624190065
+37,8,8,0.00022969364609951476,0.004319654427645789
+38,8,7,0.00022969364609951476,0.003779697624190065
+39,14,9,0.0004019638806741509,0.004859611231101512
+40,2,2,5.742341152487869e-05,0.0010799136069114472
+41,2,2,5.742341152487869e-05,0.0010799136069114472
+43,2,2,5.742341152487869e-05,0.0010799136069114472
+44,2,2,5.742341152487869e-05,0.0010799136069114472
+45,3,3,8.613511728731804e-05,0.0016198704103671706
+48,3,1,8.613511728731804e-05,0.0005399568034557236
--- a/validation_output/fragment_library_analysis/fragment_atom_count_summary.csv
+++ b/validation_output/fragment_library_analysis/fragment_atom_count_summary.csv
@@ -0,0 +1,2 @@
+rows,unique_parent_molecules,unique_fragment_smiles,min_atom_count,p05_atom_count,p25_atom_count,median_atom_count,mean_atom_count,p75_atom_count,p95_atom_count,max_atom_count
+34829,4451,1852,1,1.0,1.0,1.0,3.3206523299549224,2.0,14.0,48
--- a/validation_output/fragment_library_analysis/fragment_library_analysis_report.md
+++ b/validation_output/fragment_library_analysis/fragment_library_analysis_report.md
@@ -0,0 +1,151 @@
+# Fragment Library Analysis Report
+
+## Scope and Dataset
+
+- Input fragment library: `validation_extract` rows merged with `parent_molecules` metadata.
+- Current validated design library contains **34,829** splice-ready fragment rows from **4,451** parent macrolactones.
+- The 16-membered subset contains **8,108** fragment rows from **1,105** parent molecules.
+- This dataset is narrower than the earlier broad workflow summary because it only keeps **splice-ready, single-anchor fragments** from the validated library. It should not be compared to prior total-fragment counts as if they were the same denominator.
+
+## Figure 1. Global Atom-Count Distribution
+
+![Fragment atom count distribution](fragment_atom_count_distribution.png)
+
+- Heavy-atom count is extremely right-skewed: p05=1.0, p25=1.0, median=1.0, p75=2.0, p95=14.0.
+- A conservative cleanup filter of `<= 2` heavy atoms removes **28,069** rows (80.6%) but only **26** unique fragment SMILES (1.4%).
+- A design-oriented filter aligned with your previous analysis, `<= 3` heavy atoms, removes **28,550** rows (82.0%) and **52** unique fragment SMILES (2.8%).
+- Interpretation: `<= 2` is the safer default for library cleanup, while `> 3` is useful for positional diversity analysis because it suppresses one- and two-atom noise.
+
+## Figure 2. Ring-Specific Counts Before and After Size Filtering
+
+![Ring 16 position count comparison](ring16_position_count_comparison.png)
+
+- This figure compares all splice-ready fragments with the design-oriented subset that keeps fragments with **>= 4 heavy atoms**.
+- After size filtering, the most populated 16-membered positions are:
+
+```text
+ cleavage_position  total_fragments_gt3  gt3_row_fraction
+                13                  709          0.809361
+                 3                  269          0.256679
+                 4                  269          0.452101
+                15                  205          0.274431
+                12                  177          0.190323
+                 9                  141          0.192098
+```
+
+## Figure 3. Atom-Count Spread for Design-Relevant Fragments
+
+![Ring 16 atom-count boxplot](ring16_position_atom_count_boxplot_gt3.png)
+
+- This boxplot only includes fragments with **>= 4 heavy atoms**.
+- Positions 13, 3, 4, 11 and 6 carry the broadest large-fragment size envelopes; position 15 remains common but its size spread is narrower, indicating repeated reuse of a small set of acyl-like substituents.
+
+## Figure 4. Position-Wise Diversity Metrics After Filtering
+
+![Ring 16 filtered diversity](ring16_position_diversity_gt3.png)
+
+- Diversity is evaluated with three complementary metrics:
+  1. `unique_fragments`: raw chemotype count.
+  2. `normalized_shannon_entropy`: how evenly those chemotypes are distributed.
+  3. `mean_pairwise_tanimoto_distance`: structural spread in Morgan fingerprint space.
+
+Top robust positions after removing <=3-heavy-atom fragments (`total_fragments >= 20`):
+
+```text
+ cleavage_position  total_fragments  unique_fragments  normalized_shannon_entropy  mean_pairwise_tanimoto_distance  mean_atom_count
+                 6               89                60                    0.973126                         0.780092        12.584270
+                12              177                83                    0.885717                         0.825769        10.000000
+                 3              269               117                    0.854464                         0.764966        15.405204
+                13              709               193                    0.763147                         0.565162        18.906911
+                 9              141                37                    0.729256                         0.804709         7.815603
+                 4              269                63                    0.585202                         0.701620        13.375465
+                15              205                 8                    0.455556                         0.769222         4.692683
+```
+
+- Within the filtered 16-membered set, **position 6** is the strongest support for your medicinal-chemistry hypothesis: it has moderate abundance (**89** rows) but high chemotype diversity and near-maximal entropy.
+- **Position 15** is also clearly relevant because it retains **205** filtered rows, but its diversity is narrow; the site is frequent, yet dominated by a few acyl motifs.
+
+## Figure 5. Focus on Medicinal-Chemistry Hotspots
+
+![Ring 16 medchem hotspot comparison](ring16_medchem_hotspot_comparison.png)
+
+This panel focuses on positions 6, 7, 15 and 16 because these are the literature-guided derivatization positions from tylosin / tilmicosin / tildipirosin / tulathromycin-like scaffold analysis.
+
+```text
+ cleavage_position  total_fragments_all  total_fragments_gt3  unique_fragments_gt3  normalized_shannon_entropy_gt3  mean_pairwise_tanimoto_distance_gt3
+                 6                  536                   89                    60                        0.973126                             0.780092
+                 7                   23                    4                     1                        0.000000                             0.000000
+                15                  747                  205                     8                        0.455556                             0.769222
+                16                  135                    5                     5                        1.000000                             0.891010
+```
+
+- Position 6: supported by the current database as a **design-relevant and structurally diverse** site.
+- Position 7: not supported as a prevalent natural hotspot in the current database; it appears only a few times after filtering, so it should be described as a **literature- or scaffold-guided modification site**, not a database-enriched site.
+- Position 15: supported as a **frequent modification site**, but the retained chemotypes are concentrated into a small number of acyl substituents.
+- Position 16: not prevalent in the current database, but the few retained fragments are structurally distinct singletons; this makes it a **low-evidence exploratory site**, not a high-confidence natural hotspot.
+
+## Figure 6. Are the Top Positions Driven by Ring-Bearing Side Chains?
+
+![Ring 16 ring sensitivity](ring16_position_ring_sensitivity.png)
+
+- Multi-anchor bridge or fused components do **not** enter the fragment library because the collector only keeps side-chain components with exactly one ring connection; see [src/macro_lactone_toolkit/_core.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/_core.py#L293) and [src/macro_lactone_toolkit/validation/validator.py](/Users/lingyuzeng/project/macro-lactone-sidechain-profiler/macro_split/src/macro_lactone_toolkit/validation/validator.py#L206).
+- What can still happen is that a retained fragment is a **single-anchor but ring-bearing side chain** such as a sugar, heterocycle or other cyclic appendage.
+
+```text
+ cleavage_position  total_fragments_gt3  cyclic_fragment_rows  acyclic_fragment_rows  acyclic_row_fraction  cyclic_unique_fragments  acyclic_unique_fragments
+                 3                  269                   231                     38              0.141264                       94                        23
+                 4                  269                   263                      6              0.022305                       57                         6
+                 5                    0                     0                      0              0.000000                        0                         0
+                 6                   89                    82                      7              0.078652                       54                         6
+                 7                    4                     0                      4              1.000000                        0                         1
+                 8                    0                     0                      0              0.000000                        0                         0
+                 9                  141                    61                     80              0.567376                       29                         8
+                10                    4                     4                      0              0.000000                        4                         0
+                11                    7                     0                      7              1.000000                        0                         5
+                12                  177                   116                     61              0.344633                       38                        45
+                13                  709                   689                     20              0.028209                      187                         6
+                14                    1                     1                      0              0.000000                        1                         0
+                15                  205                     4                    201              0.980488                        3                         5
+                16                    5                     4                      1              0.200000                        4                         1
+```
+
+- This shows that positions 13, 4, 3 and 6 are indeed dominated by **ring-bearing single-anchor side chains**, not by leaked bridge fragments.
+- Position 15 behaves differently: almost all retained `>3` fragments are **acyclic** acyl-like substituents.
+- Therefore, if your scientific question is specifically about non-cyclic side-chain diversification, the ranking should be recalculated on an acyclic-only subset.
+
+Acyclic-only ranking after removing <=3-heavy-atom fragments:
+
+```text
+ cleavage_position  total_fragments  unique_fragments  normalized_shannon_entropy  mean_pairwise_tanimoto_distance  mean_atom_count
+                15              201                 5                    0.526528                         0.633544         4.562189
+                 9               80                 8                    0.527036                         0.607697         4.387500
+                12               61                45                    0.976171                         0.792308         8.409836
+                 3               38                23                    0.949162                         0.677847        12.236842
+                13               20                 6                    0.900676                         0.749984         7.500000
+                 6                7                 6                    0.975504                         0.591713         7.571429
+                11                7                 5                    0.962961                         0.695245        14.857143
+                 4                6                 6                    1.000000                         0.730513         5.500000
+```
+
+- Under this stricter acyclic-only view, position 15 becomes the dominant site, followed by 9, 12 and 3. Positions 13 and 4 no longer dominate, confirming that their earlier prominence is mainly driven by cyclic side chains.
+
+## Reconciliation With the Previous Conclusion
+
+The earlier statement that `6,7,15,16` are important 16-membered macrolide modification positions is **not invalidated** by the current analysis, but it must be phrased carefully because it answers a different question.
+
+- The **previous conclusion** came from scaffold-centric medicinal chemistry on known 16-membered macrolides and identifies **synthetically exploited modification positions**.
+- The **current MacrolactoneDB analysis** measures **natural side-chain occurrence and diversity** in a validated fragment library.
+- These are related but not identical concepts. A site can be medicinally attractive even if it is rare in natural products, and a site can be naturally diverse without being the preferred position for semisynthetic optimization.
+
+### Recommended paper-safe wording
+
+> In the validated MacrolactoneDB fragment library, natural side-chain diversity of 16-membered macrolactones is concentrated primarily at positions 13, 3/4 and 12. After excluding fragments with <=3 heavy atoms to focus on design-relevant substituents, position 6 remains strongly diversity-enriched and position 15 remains frequency-enriched, whereas positions 7 and 16 are sparse and should be interpreted as literature-guided derivatization sites rather than statistically dominant natural hotspots.
+
+### Practical interpretation for fragment-library design
+
+- Use `<= 2` heavy atoms as the **default cleanup filter** for the production fragment library.
+- Use the stricter `> 3` heavy-atom subset when discussing **position-wise diversity** or aligning with the previous exploratory analysis.
+- For 16-membered macrolide design, prioritize positions **13, 3, 4, 12 and 6** for natural-diversity-driven fragment mining.
+- Keep positions **15** as a targeted acyl-modification site even though its chemotype diversity is narrower.
+- Treat positions **7 and 16** as hypothesis-driven medicinal chemistry positions that need literature or synthesis justification beyond database prevalence.
+
--- a/validation_output/fragment_library_analysis/fragment_library_analysis_report_zh.md
+++ b/validation_output/fragment_library_analysis/fragment_library_analysis_report_zh.md
@@ -0,0 +1,103 @@
+# 大环内酯碎片库分析报告（中文）
+
+## 数据范围
+
+- 当前验证后的可拼接碎片库包含 **34,829** 条片段记录，来源于 **4,451** 个母体分子。
+- 其中 16 元环子集包含 **8,108** 条片段记录，来源于 **1,105** 个母体分子。
+- 用于设计相关位点分析的严格子集定义为：片段重原子数 **>= 4**。
+
+## 全库碎片大小结论
+
+- 默认清洗阈值建议使用 `<= 2` 重原子删除。该阈值会删除 **28,069** 条记录（80.6%），但仅删除 **26** 个唯一片段（1.4%）。
+- 若用于和你之前的分析口径对齐，则建议采用 `> 3` 重原子作为设计相关片段集合。此时会删除 **28,550** 条记录（82.0%）。
+
+## 16 元环位点结论
+
+- 在 16 元环中，保留所有可拼接片段时，天然侧链多样性较高的位置主要集中在 `13、3/4、12`，并且 6 位也显示出较强的设计相关多样性。
+- 在 `> 3` 重原子子集中，样本量足够且多样性较高的位点为：
+
+```text
+ cleavage_position  total_fragments  unique_fragments  normalized_shannon_entropy  mean_pairwise_tanimoto_distance  mean_atom_count
+                 6               89                60                    0.973126                         0.780092        12.584270
+                12              177                83                    0.885717                         0.825769        10.000000
+                 3              269               117                    0.854464                         0.764966        15.405204
+                13              709               193                    0.763147                         0.565162        18.906911
+                 9              141                37                    0.729256                         0.804709         7.815603
+                 4              269                63                    0.585202                         0.701620        13.375465
+                15              205                 8                    0.455556                         0.769222         4.692683
+```
+
+- 其中 6 位最能支持你的药化判断：它不是最高频位点，但在设计相关大侧链中显示出很高的结构多样性。
+- 15 位则更偏向高频但低多样性的酰基修饰位点。
+
+## 桥环 / 稠环干扰的敏感性分析
+
+桥连或双锚点侧链不会进入当前片段库，因为断裂逻辑只保留与主环存在 **1 个连接点** 的侧链组件。也就是说，真正的 bridge / fused multi-anchor components 已被代码层面排除。
+
+但是，需要额外区分另一类情况：**cyclic single-anchor side chains**。这类片段虽然只在一个位置连到主环，因此会被保留下来，但片段自身可能包含糖环、杂环或其他环状骨架，仍然会显著影响位点多样性排名。
+
+敏感性分析结果如下：
+
+```text
+ cleavage_position  total_fragments_gt3  cyclic_fragment_rows  acyclic_fragment_rows  acyclic_row_fraction  cyclic_unique_fragments  acyclic_unique_fragments
+                 3                  269                   231                     38              0.141264                       94                        23
+                 4                  269                   263                      6              0.022305                       57                         6
+                 5                    0                     0                      0              0.000000                        0                         0
+                 6                   89                    82                      7              0.078652                       54                         6
+                 7                    4                     0                      4              1.000000                        0                         1
+                 8                    0                     0                      0              0.000000                        0                         0
+                 9                  141                    61                     80              0.567376                       29                         8
+                10                    4                     4                      0              0.000000                        4                         0
+                11                    7                     0                      7              1.000000                        0                         5
+                12                  177                   116                     61              0.344633                       38                        45
+                13                  709                   689                     20              0.028209                      187                         6
+                14                    1                     1                      0              0.000000                        1                         0
+                15                  205                     4                    201              0.980488                        3                         5
+                16                    5                     4                      1              0.200000                        4                         1
+```
+
+- `13` 位：`689/709` 条大侧链片段本身带环，说明该位点的高多样性主要由天然糖基/环状侧链驱动。
+- `4` 位：`263/269` 条大侧链片段带环，几乎同样完全由环状侧链主导。
+- `3` 位：`231/269` 条大侧链片段带环，带环片段占主导。
+- `12` 位：环状与非环状侧链并存，但环状侧链仍占多数。
+- `6` 位：多数保留的大侧链也带环，但仍表现出较强的多样性。
+- `15` 位：几乎全部是非环状侧链，因此更接近半合成药化修饰位点的特征。
+
+## 仅看非环状侧链时的位点排序
+
+- 当只保留 `> 3` 重原子且 **不含环** 的侧链时，位点排序明显变化：
+
+```text
+ cleavage_position  total_fragments  unique_fragments  normalized_shannon_entropy  mean_pairwise_tanimoto_distance  mean_atom_count
+                15              201                 5                    0.526528                         0.633544         4.562189
+                 9               80                 8                    0.527036                         0.607697         4.387500
+                12               61                45                    0.976171                         0.792308         8.409836
+                 3               38                23                    0.949162                         0.677847        12.236842
+                13               20                 6                    0.900676                         0.749984         7.500000
+                 6                7                 6                    0.975504                         0.591713         7.571429
+                11                7                 5                    0.962961                         0.695245        14.857143
+                 4                6                 6                    1.000000                         0.730513         5.500000
+```
+
+- 在这个更严格的口径下，`15` 位成为最主要的非环状侧链位点，随后是 `9、12、3` 位。
+- 因此，`13、3、4、12` 的高天然多样性是真实存在的，但它主要表征的是**带环单锚点天然侧链**的富集，而不是桥连片段泄漏。
+
+## 论文可直接引用的中文讨论段落
+
+在当前验证后的 MacrolactoneDB 片段库中，桥连或双锚点侧链不会进入当前片段库，因此 16 元大环内酯位点排序的变化并不是由桥环片段误纳入所致。本研究的断裂规则仅保留与主环具有单一连接点的可拼接侧链；然而，许多被保留的大侧链本身仍可包含糖环、杂环或稠合环等 cyclic single-anchor side chains（带环单锚点侧链），这会显著抬高 13、3、4、12 以及 6 位的天然侧链多样性统计。相反，当仅保留 >3 个重原子且进一步限定为非环状侧链后，位点排序转而更偏向 15、9、12 和 3 位，说明 13、3、4、12 的优势主要反映了天然带环侧链的富集，而 15 位则更接近药物化学中常见的非环状酰基修饰位点。因此，在论文讨论中应明确区分“天然带环侧链多样性热点”和“非环状半合成修饰热点”这两类位点概念。
+
+## 建议的论文表述方式
+
+- 若讨论天然产物中的侧链多样性，可写为：`16 元大环内酯的天然侧链多样性主要集中在 13、3/4 和 12 位，并在 6 位保留较强的设计相关多样性。`
+- 若讨论药化半合成改造热点，可写为：`6、7、15、16 位代表文献和先导化合物研究中优先使用的衍生化位点，其中 6 和 15 位在数据库统计中分别对应高多样性和高频率信号，而 7 和 16 位更多体现为文献指导的探索性位点。`
+- 若专门讨论非环状侧链设计，则应强调：`在排除 <=3 重原子小片段并进一步排除带环侧链后，15 位是最主要的非环状侧链修饰位点。`
+
+## 相关图表
+
+- 全库原子数分布：`fragment_atom_count_distribution.png`
+- 16 元环过滤前后位点数量对比：`ring16_position_count_comparison.png`
+- 16 元环大侧链 boxplot：`ring16_position_atom_count_boxplot_gt3.png`
+- 16 元环位点多样性图：`ring16_position_diversity_gt3.png`
+- 16 元环桥环/带环侧链敏感性图：`ring16_position_ring_sensitivity.png`
+- 16 元环药化热点对比图：`ring16_medchem_hotspot_comparison.png`
+
--- a/validation_output/fragment_library_analysis/ring16_medchem_hotspot_comparison.csv
+++ b/validation_output/fragment_library_analysis/ring16_medchem_hotspot_comparison.csv
@@ -0,0 +1,5 @@
+cleavage_position,total_fragments_all,unique_fragments_all,mean_atom_count_all,total_fragments_gt3,unique_fragments_gt3,mean_atom_count_gt3,gt3_row_fraction,gt3_unique_fraction,normalized_shannon_entropy_all,mean_pairwise_tanimoto_distance_all,normalized_shannon_entropy_gt3,mean_pairwise_tanimoto_distance_gt3
+6,536,68,2.9402985074626864,89,60,12.584269662921349,0.166044776119403,0.8823529411764706,0.30950388972503856,0.8133239394372517,0.9731262721536741,0.7800916405422387
+7,23,4,2.5652173913043477,4,1,10.0,0.17391304347826086,0.25,0.8567985762221871,0.86,0.0,0.0
+15,747,13,2.0174029451137887,205,8,4.692682926829268,0.2744310575635877,0.6153846153846154,0.3487511597393803,0.8168500968741627,0.45555578681139725,0.7692219308030589
+16,135,8,1.9851851851851852,5,5,8.4,0.037037037037037035,0.625,0.4148320683071004,0.9006253315127155,1.0000000000000002,0.8910101403362273
--- a/validation_output/fragment_library_analysis/ring16_position_count_comparison.csv
+++ b/validation_output/fragment_library_analysis/ring16_position_count_comparison.csv
@@ -0,0 +1,15 @@
+cleavage_position,total_fragments_all,unique_fragments_all,mean_atom_count_all,total_fragments_gt3,unique_fragments_gt3,mean_atom_count_gt3,gt3_row_fraction,gt3_unique_fraction
+3,1048,121,5.097328244274809,269,117,15.405204460966543,0.2566793893129771,0.9669421487603306
+4,595,70,6.894117647058824,269,63,13.37546468401487,0.45210084033613446,0.9
+5,54,2,1.0,0,0,0.0,0.0,0.0
+6,536,68,2.9402985074626864,89,60,12.584269662921349,0.166044776119403,0.8823529411764706
+7,23,4,2.5652173913043477,4,1,10.0,0.17391304347826086,0.25
+8,123,4,1.016260162601626,0,0,0.0,0.0,0.0
+9,734,41,2.310626702997275,141,37,7.815602836879433,0.19209809264305178,0.9024390243902439
+10,993,7,1.0433031218529707,4,4,11.5,0.004028197381671702,0.5714285714285714
+11,249,8,1.3895582329317269,7,5,14.857142857142858,0.028112449799196786,0.625
+12,930,99,3.6268817204301076,177,83,10.0,0.19032258064516128,0.8383838383838383
+13,876,198,15.501141552511415,709,193,18.90691114245416,0.8093607305936074,0.9747474747474747
+14,1065,6,1.267605633802817,1,1,14.0,0.0009389671361502347,0.16666666666666666
+15,747,13,2.0174029451137887,205,8,4.692682926829268,0.2744310575635877,0.6153846153846154
+16,135,8,1.9851851851851852,5,5,8.4,0.037037037037037035,0.625
--- a/validation_output/fragment_library_analysis/ring16_position_diversity.csv
+++ b/validation_output/fragment_library_analysis/ring16_position_diversity.csv
@@ -0,0 +1,15 @@
+cleavage_position,total_fragments,unique_fragments,normalized_unique_ratio,shannon_entropy,normalized_shannon_entropy,mean_pairwise_tanimoto_distance,mean_atom_count,median_atom_count,std_atom_count,iqr_atom_count,min_atom_count,max_atom_count,atom_count_range
+3,1048,121,0.11545801526717557,2.1513093369119027,0.4485828387328402,0.7727580364590483,5.097328244274809,2.0,7.125515417214252,5.0,1,39,38
+4,595,70,0.11764705882352941,2.4817848918166048,0.584156213064148,0.7382343170047687,6.894117647058824,2.0,6.017946467329047,12.5,1,18,17
+5,54,2,0.037037037037037035,0.0922160573371918,0.13303964861069892,0.8,1.0,1.0,0.0,0.0,1,1,0
+6,536,68,0.12686567164179105,1.3059540474767763,0.30950388972503856,0.8133239394372517,2.9402985074626864,1.0,4.444116006032291,0.0,1,20,19
+7,23,4,0.17391304347826086,1.1877750348323688,0.8567985762221871,0.86,2.5652173913043477,1.0,3.4113122166840055,0.0,1,10,9
+8,123,4,0.032520325203252036,0.2372288446747181,0.171124438884017,0.7915343915343915,1.016260162601626,1.0,0.1795993661331262,0.0,1,3,2
+9,734,41,0.055858310626702996,1.472202321733401,0.39643833357458974,0.8284402664223979,2.310626702997275,1.0,3.280936828670794,0.0,1,17,16
+10,993,7,0.007049345417925478,0.14975384152906807,0.07695825092529042,0.8712928851639762,1.0433031218529707,1.0,0.6755237166842796,0.0,1,14,13
+11,249,8,0.0321285140562249,0.41116030451725305,0.1977263107791457,0.8041654723607728,1.3895582329317269,1.0,3.5978646341022595,0.0,1,41,40
+12,930,99,0.1064516129032258,2.251679166978565,0.49001532939616665,0.8394325303461457,3.6268817204301076,3.0,3.4290862065303043,2.0,1,21,20
+13,876,198,0.22602739726027396,3.9101740989981857,0.7394055701617327,0.5841396079985519,15.501141552511415,13.0,9.597433474135945,11.0,1,36,35
+14,1065,6,0.005633802816901409,0.6717715558650733,0.3749228439431623,0.848458035826457,1.267605633802817,1.0,0.5852108438838486,1.0,1,14,13
+15,747,13,0.01740294511378849,0.8945290630874894,0.3487511597393803,0.8168500968741627,2.0174029451137887,1.0,1.7641278118940684,3.0,1,13,12
+16,135,8,0.05925925925925926,0.8626190356587518,0.4148320683071004,0.9006253315127155,1.9851851851851852,2.0,1.4141359627921224,0.5,1,13,12
--- a/validation_output/fragment_library_analysis/ring16_position_diversity_gt3.csv
+++ b/validation_output/fragment_library_analysis/ring16_position_diversity_gt3.csv
@@ -0,0 +1,13 @@
+cleavage_position,total_fragments,unique_fragments,normalized_unique_ratio,shannon_entropy,normalized_shannon_entropy,mean_pairwise_tanimoto_distance,mean_atom_count,median_atom_count,std_atom_count,iqr_atom_count,min_atom_count,max_atom_count,atom_count_range
+3,269,117,0.4349442379182156,4.069106586040983,0.8544640833690577,0.7649661414788775,15.405204460966543,14.0,7.356263980699357,10.0,4,39,35
+4,269,63,0.2342007434944238,2.424572263644206,0.5852023706107886,0.7016204220824205,13.37546468401487,14.0,1.7682432704870303,0.0,5,18,13
+6,89,60,0.6741573033707865,3.984314260747859,0.9731262721536741,0.7800916405422387,12.584269662921349,13.0,2.7016806619569276,3.0,5,20,15
+7,4,1,0.25,-0.0,0.0,0.0,10.0,10.0,0.0,0.0,10,10,0
+9,141,37,0.2624113475177305,2.633283400210265,0.7292559576027439,0.8047085590557639,7.815602836879433,5.0,4.30339275172448,7.0,4,17,13
+10,4,4,1.0,1.3862943611198906,1.0,0.7794154494906375,11.5,11.5,1.8027756377319946,2.0,9,14,5
+11,7,5,0.7142857142857143,1.5498260458782016,0.9629610647945144,0.6952446898083474,14.857142857142858,4.0,16.54801301346713,19.5,4,41,37
+12,177,83,0.4689265536723164,3.913840989343892,0.8857167154747141,0.8257691691480503,10.0,10.0,2.788191402941166,4.0,4,21,17
+13,709,193,0.27221438645980256,4.016208118657202,0.7631473589542491,0.5651618154111845,18.90691114245416,15.0,7.276898560461122,13.0,4,36,32
+14,1,1,1.0,-0.0,0.0,0.0,14.0,14.0,0.0,0.0,14,14,0
+15,205,8,0.03902439024390244,0.9473016276482624,0.45555578681139725,0.7692219308030589,4.692682926829268,5.0,1.2049471689465932,1.0,4,13,9
+16,5,5,1.0,1.6094379124341005,1.0000000000000002,0.8910101403362273,8.4,7.0,2.4979991993593593,2.0,6,13,7
--- a/validation_output/fragment_library_analysis/ring16_position_ring_sensitivity.csv
+++ b/validation_output/fragment_library_analysis/ring16_position_ring_sensitivity.csv
@@ -0,0 +1,15 @@
+cleavage_position,total_fragments_gt3,cyclic_fragment_rows,acyclic_fragment_rows,cyclic_row_fraction,acyclic_row_fraction,unique_fragments_gt3,cyclic_unique_fragments,acyclic_unique_fragments
+3,269,231,38,0.8587360594795539,0.1412639405204461,117,94,23
+4,269,263,6,0.9776951672862454,0.022304832713754646,63,57,6
+5,0,0,0,0.0,0.0,0,0,0
+6,89,82,7,0.9213483146067416,0.07865168539325842,60,54,6
+7,4,0,4,0.0,1.0,1,0,1
+8,0,0,0,0.0,0.0,0,0,0
+9,141,61,80,0.4326241134751773,0.5673758865248227,37,29,8
+10,4,4,0,1.0,0.0,4,4,0
+11,7,0,7,0.0,1.0,5,0,5
+12,177,116,61,0.655367231638418,0.3446327683615819,83,38,45
+13,709,689,20,0.9717912552891397,0.028208744710860368,193,187,6
+14,1,1,0,1.0,0.0,1,1,0
+15,205,4,201,0.01951219512195122,0.9804878048780488,8,3,5
+16,5,4,1,0.8,0.2,5,4,1