Files
macro_split/scripts/generate_sdf_and_statistics.py
lingyuzeng c0ead42384 feat(toolkit): add classification and migration
Implement the standard/non-standard/not-macrolactone classification layer
and integrate it into analyzer, fragmenter, and CLI outputs.

Port the remaining legacy package capabilities into new visualization and
workflow modules, restore batch/statistics/SDF scripts on top of the flat
CSV workflow, and update active docs to the new package API.
2026-03-18 23:56:41 +08:00

58 lines
1.8 KiB
Python

from __future__ import annotations
import argparse
import json
from collections import Counter
from pathlib import Path
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script.

    Both ``--input`` and ``--output-dir`` are mandatory string options; the
    parsed namespace exposes them as ``input`` and ``output_dir``.
    """
    cli = argparse.ArgumentParser(
        description="Generate cleavage statistics and ETKDGv3 parent-molecule SDF files from flat fragment CSV."
    )
    # The two options share the same configuration, so register them in one pass.
    for flag in ("--input", "--output-dir"):
        cli.add_argument(flag, required=True)
    return cli
def main(argv: list[str] | None = None) -> None:
    """Write cleavage-position statistics and one 3D SDF file per parent molecule.

    Reads the CSV given by ``--input`` (columns used: ``cleavage_position``,
    ``parent_id`` and ``parent_smiles``) and writes into ``--output-dir``:

    * ``cleavage_position_statistics.json`` -- number of rows per cleavage
      position, total row count and number of distinct parent ids.
    * ``sdf/<parent_id>_3d.sdf`` -- one ETKDGv3-embedded, UFF-optimized
      structure for each distinct ``(parent_id, parent_smiles)`` pair.

    Parents whose SMILES is missing, cannot be parsed, or cannot be embedded
    are skipped; a one-line summary of the skipped ids is printed to stderr.

    Args:
        argv: Command-line arguments to parse. ``None`` lets argparse read
            them from ``sys.argv``.
    """
    import sys  # local import: only used for the skipped-molecule report

    args = build_parser().parse_args(argv)
    dataframe = pd.read_csv(args.input)

    output_dir = Path(args.output_dir)
    sdf_dir = output_dir / "sdf"
    output_dir.mkdir(parents=True, exist_ok=True)
    sdf_dir.mkdir(parents=True, exist_ok=True)

    # Empty cells are read as NaN and int(NaN) raises, so rows without a
    # cleavage position are left out of the per-position tally. They still
    # count towards "total_fragments" below.
    position_counts = Counter(
        int(position) for position in dataframe["cleavage_position"].dropna()
    )
    stats = {
        "position_counts": dict(sorted(position_counts.items())),
        "total_fragments": int(len(dataframe)),
        "total_parent_molecules": int(dataframe["parent_id"].nunique()),
    }
    (output_dir / "cleavage_position_statistics.json").write_text(
        json.dumps(stats, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )

    parent_rows = dataframe[["parent_id", "parent_smiles"]].drop_duplicates()
    embed_params = AllChem.ETKDGv3()  # loop-invariant, build once
    skipped: list[str] = []
    for parent in parent_rows.itertuples(index=False):
        smiles = parent.parent_smiles
        # An empty SMILES cell arrives as float NaN, which is not a valid
        # argument for MolFromSmiles; treat it like an unparsable SMILES.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            skipped.append(f"{parent.parent_id} (invalid SMILES)")
            continue

        mol = Chem.AddHs(mol)
        if AllChem.EmbedMolecule(mol, embed_params) != 0:
            skipped.append(f"{parent.parent_id} (embedding failed)")
            continue
        AllChem.UFFOptimizeMolecule(mol)

        writer = Chem.SDWriter(str(sdf_dir / f"{parent.parent_id}_3d.sdf"))
        try:
            writer.write(mol)
        finally:
            # Always release the file handle, even if writing fails.
            writer.close()

    if skipped:
        print(
            f"Skipped {len(skipped)} parent molecule(s): " + ", ".join(skipped),
            file=sys.stderr,
        )
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()