"""Analyze flat fragment CSV output and generate summary reports.

Reads a fragment table from a CSV file and writes four artifacts into the
output directory:

* ``position_statistics.csv`` - per-cleavage-position aggregates
* ``fragment_property_summary.csv`` - one-row summary of the whole table
* ``position_frequencies.png`` - bar chart of fragment counts per position
* ``analysis_summary.txt`` - short plain-text summary
"""

from __future__ import annotations

import argparse
from pathlib import Path

import matplotlib

# Select the non-interactive Agg backend before pyplot is imported so the
# script can render figures to files without a display.
matplotlib.use("Agg")

import matplotlib.pyplot as plt
import pandas as pd

# Columns that the analysis below reads from the input CSV.
_REQUIRED_COLUMNS = (
    "cleavage_position",
    "fragment_id",
    "fragment_smiles_plain",
    "atom_count",
    "molecular_weight",
    "parent_id",
)


def build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser with ``--input`` and ``--output-dir``."""
    parser = argparse.ArgumentParser(
        description="Analyze flat fragment CSV output and generate reports."
    )
    parser.add_argument("--input", required=True)
    parser.add_argument("--output-dir", required=True)
    return parser


def _require_columns(dataframe: pd.DataFrame) -> None:
    """Raise ``KeyError`` naming every required column absent from *dataframe*."""
    missing = [name for name in _REQUIRED_COLUMNS if name not in dataframe.columns]
    if missing:
        raise KeyError(
            f"Input CSV is missing required column(s): {', '.join(missing)}"
        )


def _position_statistics(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Aggregate fragment counts and mean properties per cleavage position."""
    return (
        dataframe.groupby("cleavage_position")
        .agg(
            total_count=("fragment_id", "size"),
            unique_fragments=("fragment_smiles_plain", "nunique"),
            mean_atom_count=("atom_count", "mean"),
            mean_molecular_weight=("molecular_weight", "mean"),
        )
        .reset_index()
        .sort_values("cleavage_position")
    )


def _property_summary(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Build the one-row summary of the whole fragment table."""
    return pd.DataFrame(
        [
            {
                "unique_parents": dataframe["parent_id"].nunique(),
                "total_fragments": len(dataframe),
                "unique_fragments": dataframe["fragment_smiles_plain"].nunique(),
                "mean_atom_count": dataframe["atom_count"].mean(),
                "mean_molecular_weight": dataframe["molecular_weight"].mean(),
            }
        ]
    )


def _save_position_plot(position_stats: pd.DataFrame, destination: Path) -> None:
    """Render the fragment-count-per-position bar chart to *destination*."""
    figure, axis = plt.subplots(figsize=(10, 6))
    try:
        axis.bar(
            position_stats["cleavage_position"],
            position_stats["total_count"],
            color="steelblue",
        )
        axis.set_xlabel("Cleavage Position")
        axis.set_ylabel("Fragment Count")
        axis.set_title("Fragment Frequency by Cleavage Position")
        axis.grid(axis="y", alpha=0.3)
        figure.tight_layout()
        figure.savefig(destination, dpi=300, bbox_inches="tight")
    finally:
        # Close even when rendering or saving fails so the figure is not
        # left registered with pyplot.
        plt.close(figure)


def _most_frequent_position(position_stats: pd.DataFrame) -> str:
    """Return the cleavage position with the highest count, or ``"N/A"``.

    ``position_stats`` has no rows when the input contains no non-null
    cleavage positions (groupby drops missing keys); indexing the first row
    would raise ``IndexError`` in that case, so a placeholder is returned.
    Ties resolve to the lowest position because ``idxmax`` returns the first
    maximum and the rows are ordered by ``cleavage_position``.
    """
    if position_stats.empty:
        return "N/A"
    top_label = position_stats["total_count"].idxmax()
    return str(int(position_stats.loc[top_label, "cleavage_position"]))


def main(argv: list[str] | None = None) -> None:
    """Run the analysis for the given command-line arguments.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv``.

    Raises:
        KeyError: If the input CSV lacks a column the analysis needs. The
            check runs before anything is written, so no partial output is
            produced.
    """
    args = build_parser().parse_args(argv)
    dataframe = pd.read_csv(args.input)
    _require_columns(dataframe)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    position_stats = _position_statistics(dataframe)
    position_stats.to_csv(output_dir / "position_statistics.csv", index=False)

    _property_summary(dataframe).to_csv(
        output_dir / "fragment_property_summary.csv", index=False
    )

    _save_position_plot(position_stats, output_dir / "position_frequencies.png")

    summary_lines = [
        f"Input file: {args.input}",
        f"Rows: {len(dataframe)}",
        f"Unique parent molecules: {dataframe['parent_id'].nunique()}",
        f"Unique fragments: {dataframe['fragment_smiles_plain'].nunique()}",
        f"Most frequent cleavage position: {_most_frequent_position(position_stats)}",
    ]
    (output_dir / "analysis_summary.txt").write_text(
        "\n".join(summary_lines) + "\n", encoding="utf-8"
    )


if __name__ == "__main__":
    main()