Implement the standard/non-standard/not-macrolactone classification layer and integrate it into analyzer, fragmenter, and CLI outputs. Port the remaining legacy package capabilities into new visualization and workflow modules, restore batch/statistics/SDF scripts on top of the flat CSV workflow, and update active docs to the new package API.
75 lines
2.6 KiB
Python
75 lines
2.6 KiB
Python
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
import matplotlib
|
|
|
|
matplotlib.use("Agg")
|
|
|
|
import matplotlib.pyplot as plt
|
|
import pandas as pd
|
|
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this report script.

    Returns:
        An ``argparse.ArgumentParser`` exposing two mandatory options,
        ``--input`` and ``--output-dir``; both are parsed as plain strings.
    """
    cli = argparse.ArgumentParser(
        description="Analyze flat fragment CSV output and generate reports.",
    )
    # Both options are required; register them in their original order so
    # the generated usage/help text is unchanged.
    for flag in ("--input", "--output-dir"):
        cli.add_argument(flag, required=True)
    return cli
|
|
|
|
|
|
def _summarize_positions(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Aggregate fragment rows into one row per ``cleavage_position``.

    The result has the columns ``cleavage_position``, ``total_count``,
    ``unique_fragments``, ``mean_atom_count`` and ``mean_molecular_weight``,
    sorted by ``cleavage_position``.
    """
    return (
        dataframe.groupby("cleavage_position")
        .agg(
            total_count=("fragment_id", "size"),
            unique_fragments=("fragment_smiles_plain", "nunique"),
            mean_atom_count=("atom_count", "mean"),
            mean_molecular_weight=("molecular_weight", "mean"),
        )
        .reset_index()
        .sort_values("cleavage_position")
    )


def _plot_position_frequencies(position_stats: pd.DataFrame, destination: Path) -> None:
    """Write a bar chart of ``total_count`` per cleavage position to *destination*."""
    figure, axis = plt.subplots(figsize=(10, 6))
    try:
        axis.bar(
            position_stats["cleavage_position"],
            position_stats["total_count"],
            color="steelblue",
        )
        axis.set_xlabel("Cleavage Position")
        axis.set_ylabel("Fragment Count")
        axis.set_title("Fragment Frequency by Cleavage Position")
        axis.grid(axis="y", alpha=0.3)
        figure.tight_layout()
        figure.savefig(destination, dpi=300, bbox_inches="tight")
    finally:
        # Release the figure even when rendering or saving fails.
        plt.close(figure)


def _most_frequent_position(position_stats: pd.DataFrame) -> str:
    """Return the cleavage position with the highest ``total_count`` as text.

    Returns ``"N/A"`` when there are no positions at all (for example a
    header-only CSV). Ties resolve to the first row holding the maximum,
    which is the lowest position because *position_stats* is sorted by
    ``cleavage_position``.
    """
    if position_stats.empty:
        return "N/A"
    top_label = position_stats["total_count"].idxmax()
    return str(int(position_stats.loc[top_label, "cleavage_position"]))


def main(argv: list[str] | None = None) -> None:
    """Read a flat fragment CSV and write statistics, a plot and a summary.

    Files written into ``--output-dir`` (created if missing):

    * ``position_statistics.csv`` - per-cleavage-position aggregates.
    * ``fragment_property_summary.csv`` - one row of whole-file totals/means.
    * ``position_frequencies.png`` - bar chart of fragment counts per position.
    * ``analysis_summary.txt`` - short plain-text summary.

    The CSV is expected to provide the columns ``parent_id``,
    ``fragment_id``, ``fragment_smiles_plain``, ``cleavage_position``,
    ``atom_count`` and ``molecular_weight``.

    Args:
        argv: Command-line arguments to parse. ``None`` makes argparse fall
            back to ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    dataframe = pd.read_csv(args.input)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    position_stats = _summarize_positions(dataframe)
    position_stats.to_csv(output_dir / "position_statistics.csv", index=False)

    # These counts feed both the CSV summary and the text summary; compute once.
    total_rows = len(dataframe)
    unique_parents = dataframe["parent_id"].nunique()
    unique_fragments = dataframe["fragment_smiles_plain"].nunique()

    property_summary = pd.DataFrame(
        [
            {
                "unique_parents": unique_parents,
                "total_fragments": total_rows,
                "unique_fragments": unique_fragments,
                "mean_atom_count": dataframe["atom_count"].mean(),
                "mean_molecular_weight": dataframe["molecular_weight"].mean(),
            }
        ]
    )
    property_summary.to_csv(output_dir / "fragment_property_summary.csv", index=False)

    _plot_position_frequencies(position_stats, output_dir / "position_frequencies.png")

    summary_lines = [
        f"Input file: {args.input}",
        f"Rows: {total_rows}",
        f"Unique parent molecules: {unique_parents}",
        f"Unique fragments: {unique_fragments}",
        f"Most frequent cleavage position: {_most_frequent_position(position_stats)}",
    ]
    (output_dir / "analysis_summary.txt").write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
|
|
|
|
|
|
# Run the report generation only when this file is executed as a script;
# importing the module does not trigger main().
if __name__ == "__main__":
    main()
|