"""Generate cleavage statistics and ETKDGv3 parent-molecule SDF files.

Reads a flat fragment CSV that provides the columns ``cleavage_position``,
``parent_id`` and ``parent_smiles``, then writes:

* ``<output-dir>/cleavage_position_statistics.json`` -- per-position fragment
  counts plus the total number of fragments and distinct parent ids;
* ``<output-dir>/sdf/<parent_id>_3d.sdf`` -- one embedded, UFF-optimized 3D
  structure per parent molecule that could be parsed and embedded.
"""

from __future__ import annotations

import argparse
import json
import logging
from collections import Counter
from pathlib import Path

import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

logger = logging.getLogger(__name__)


def build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser (``--input`` and ``--output-dir``)."""
    parser = argparse.ArgumentParser(
        description="Generate cleavage statistics and ETKDGv3 parent-molecule SDF files from flat fragment CSV."
    )
    parser.add_argument(
        "--input",
        required=True,
        help="CSV file with cleavage_position, parent_id and parent_smiles columns.",
    )
    parser.add_argument(
        "--output-dir",
        required=True,
        help="Directory for the statistics JSON and the sdf/ subdirectory.",
    )
    return parser


def _write_cleavage_statistics(dataframe: pd.DataFrame, output_dir: Path) -> None:
    """Write cleavage-position counts and dataset totals as JSON."""
    position_counts = Counter(
        int(position) for position in dataframe["cleavage_position"]
    )
    stats = {
        # Sorted by the integer position; json.dumps turns the keys into strings.
        "position_counts": dict(sorted(position_counts.items())),
        "total_fragments": int(len(dataframe)),
        "total_parent_molecules": int(dataframe["parent_id"].nunique()),
    }
    (output_dir / "cleavage_position_statistics.json").write_text(
        json.dumps(stats, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )


def _build_3d_mol(parent_id: object, smiles: object, params: object) -> Chem.Mol | None:
    """Return an embedded, UFF-optimized molecule with explicit Hs, or ``None``.

    ``None`` is returned (and a warning is logged) when the SMILES value is
    missing, cannot be parsed, or cannot be embedded in 3D.
    """
    # A missing CSV cell arrives as NaN (a float); MolFromSmiles only accepts
    # strings, so reject anything else up front instead of crashing the run.
    if not isinstance(smiles, str):
        logger.warning(
            "Skipping parent %s: missing or non-string SMILES (%r)", parent_id, smiles
        )
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        logger.warning(
            "Skipping parent %s: could not parse SMILES %r", parent_id, smiles
        )
        return None

    mol = Chem.AddHs(mol)
    if AllChem.EmbedMolecule(mol, params) != 0:
        logger.warning("Skipping parent %s: 3D embedding failed", parent_id)
        return None

    status = AllChem.UFFOptimizeMolecule(mol)
    if status != 0:
        # The geometry is still written, as before; the status is only surfaced.
        logger.warning(
            "UFF optimization for parent %s returned status %s (0 means converged)",
            parent_id,
            status,
        )
    return mol


def _write_sdf(mol: Chem.Mol, path: Path) -> None:
    """Write ``mol`` to ``path`` as a single-record SDF file."""
    writer = Chem.SDWriter(str(path))
    try:
        writer.write(mol)
    finally:
        # Close even when write() raises so the file handle is not leaked.
        writer.close()


def main(argv: list[str] | None = None) -> None:
    """Run the statistics and SDF generation for the given CLI arguments.

    Args:
        argv: Argument list to parse; ``None`` lets argparse use ``sys.argv``.
    """
    args = build_parser().parse_args(argv)
    dataframe = pd.read_csv(args.input)

    output_dir = Path(args.output_dir)
    sdf_dir = output_dir / "sdf"
    output_dir.mkdir(parents=True, exist_ok=True)
    sdf_dir.mkdir(parents=True, exist_ok=True)

    _write_cleavage_statistics(dataframe, output_dir)

    # The embedding parameters do not depend on the molecule; build them once.
    params = AllChem.ETKDGv3()

    # NOTE(review): rows are de-duplicated on the (parent_id, parent_smiles)
    # pair, while the output file name depends on parent_id alone, so one
    # parent_id listed with several SMILES targets the same file more than once.
    parent_rows = dataframe[["parent_id", "parent_smiles"]].drop_duplicates()
    for parent in parent_rows.itertuples(index=False):
        mol = _build_3d_mol(parent.parent_id, parent.parent_smiles, params)
        if mol is None:
            continue
        _write_sdf(mol, sdf_dir / f"{parent.parent_id}_3d.sdf")


if __name__ == "__main__":
    main()