feat(toolkit): add classification and migration
Implement the standard/non-standard/not-macrolactone classification layer and integrate it into analyzer, fragmenter, and CLI outputs. Port the remaining legacy package capabilities into new visualization and workflow modules, restore batch/statistics/SDF scripts on top of the flat CSV workflow, and update active docs to the new package API.
This commit is contained in:
@@ -1,11 +1,18 @@
|
||||
# scripts
|
||||
|
||||
这些脚本现在都是基于 `macro_lactone_toolkit.*` 的薄封装或迁移提示。
|
||||
这些脚本都基于 `macro_lactone_toolkit.*` 的正式包接口,不再依赖旧的 `src.*` 模块。
|
||||
|
||||
- `batch_process.py`: 等价于 `macro-lactone-toolkit fragment`
|
||||
- `batch_process_ring16.py`: 等价于 `macro-lactone-toolkit fragment --ring-size 16`
|
||||
- `batch_process_multi_rings.py`: 自动识别模式的批处理封装
|
||||
- `analyze_fragments.py`: 等价于 `macro-lactone-toolkit analyze`
|
||||
- `batch_process.py`: 读取分子 CSV,输出 flat `fragments.csv`、`errors.csv` 和处理摘要 JSON
|
||||
- `batch_process_ring16.py`: 固定 `--ring-size 16` 的批处理入口
|
||||
- `batch_process_multi_rings.py`: 自动识别 12-20 元环的批处理入口
|
||||
- `analyze_fragments.py`: 读取 flat fragment CSV,生成位置统计、性质汇总和频率图
|
||||
- `generate_sdf_and_statistics.py`: 读取 flat fragment CSV,生成 cleavage 统计 JSON 和 3D SDF
|
||||
- `tylosin_splicer.py`: 使用 `macro_lactone_toolkit.splicing.*` 做简单拼接
|
||||
|
||||
核心实现与正式接口都在 `src/macro_lactone_toolkit/` 中。
|
||||
推荐工作流:
|
||||
|
||||
```bash
|
||||
python scripts/batch_process.py --input molecules.csv --output fragments.csv --errors-output errors.csv
|
||||
python scripts/analyze_fragments.py --input fragments.csv --output-dir analysis
|
||||
python scripts/generate_sdf_and_statistics.py --input fragments.csv --output-dir sdf_output
|
||||
```
|
||||
|
||||
@@ -1,10 +1,74 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
from macro_lactone_toolkit.cli import main
|
||||
import matplotlib
|
||||
|
||||
matplotlib.use("Agg")
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the fragment analysis script.

    Returns:
        A parser with two mandatory string options: ``--input`` and
        ``--output-dir``.
    """
    cli = argparse.ArgumentParser(
        description="Analyze flat fragment CSV output and generate reports."
    )
    # Both options share the same configuration: required, plain string value.
    for flag in ("--input", "--output-dir"):
        cli.add_argument(flag, required=True)
    return cli
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> None:
    """Summarize a flat fragment CSV into statistics files and a bar chart.

    Reads the CSV given by ``--input`` and writes four files into
    ``--output-dir`` (created if missing):

    * ``position_statistics.csv`` - per ``cleavage_position`` row count,
      number of distinct ``fragment_smiles_plain`` values, and the mean of
      ``atom_count`` and ``molecular_weight``.
    * ``fragment_property_summary.csv`` - one row of whole-file totals.
    * ``position_frequencies.png`` - bar chart of row count per position.
    * ``analysis_summary.txt`` - short human-readable summary.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    dataframe = pd.read_csv(args.input)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    position_stats = (
        dataframe.groupby("cleavage_position")
        .agg(
            total_count=("fragment_id", "size"),
            unique_fragments=("fragment_smiles_plain", "nunique"),
            mean_atom_count=("atom_count", "mean"),
            mean_molecular_weight=("molecular_weight", "mean"),
        )
        .reset_index()
        .sort_values("cleavage_position")
    )
    position_stats.to_csv(output_dir / "position_statistics.csv", index=False)

    unique_parents = dataframe["parent_id"].nunique()
    unique_fragments = dataframe["fragment_smiles_plain"].nunique()

    property_summary = pd.DataFrame(
        [
            {
                "unique_parents": unique_parents,
                "total_fragments": len(dataframe),
                "unique_fragments": unique_fragments,
                "mean_atom_count": dataframe["atom_count"].mean(),
                "mean_molecular_weight": dataframe["molecular_weight"].mean(),
            }
        ]
    )
    property_summary.to_csv(output_dir / "fragment_property_summary.csv", index=False)

    figure, axis = plt.subplots(figsize=(10, 6))
    try:
        axis.bar(position_stats["cleavage_position"], position_stats["total_count"], color="steelblue")
        axis.set_xlabel("Cleavage Position")
        axis.set_ylabel("Fragment Count")
        axis.set_title("Fragment Frequency by Cleavage Position")
        axis.grid(axis="y", alpha=0.3)
        figure.tight_layout()
        figure.savefig(output_dir / "position_frequencies.png", dpi=300, bbox_inches="tight")
    finally:
        # Release the figure even if rendering or saving fails.
        plt.close(figure)

    # groupby drops NaN keys, so the statistics table can be empty even when
    # the CSV has rows; indexing its first row would then raise IndexError.
    if position_stats.empty:
        most_frequent = "n/a"
    else:
        # idxmax returns the first maximum; the table is ordered by position,
        # so ties resolve deterministically to the lowest cleavage position.
        top_label = position_stats["total_count"].idxmax()
        most_frequent = str(int(position_stats.loc[top_label, "cleavage_position"]))

    summary_lines = [
        f"Input file: {args.input}",
        f"Rows: {len(dataframe)}",
        f"Unique parent molecules: {unique_parents}",
        f"Unique fragments: {unique_fragments}",
        f"Most frequent cleavage position: {most_frequent}",
    ]
    (output_dir / "analysis_summary.txt").write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.argv = ["macro-lactone-toolkit", "analyze", *sys.argv[1:]]
|
||||
main()
|
||||
|
||||
@@ -1,10 +1,67 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from macro_lactone_toolkit.cli import main
|
||||
import pandas as pd
|
||||
|
||||
from macro_lactone_toolkit.workflows import _fragment_csv_with_errors, results_to_dataframe
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the batch fragmentation script.

    Returns:
        A parser with required ``--input`` and ``--output`` options, plus
        optional ``--errors-output``, ``--summary-output``,
        ``--smiles-column`` (default ``"smiles"``), ``--id-column``
        (default ``"id"``), and integer ``--ring-size`` / ``--max-rows``
        (both default ``None``).
    """
    cli = argparse.ArgumentParser(
        description="Batch fragment macrolactones into a flat CSV workflow."
    )
    # Declared in the order they should appear in --help.
    option_specs = (
        ("--input", {"required": True}),
        ("--output", {"required": True}),
        ("--errors-output", {"default": None}),
        ("--summary-output", {"default": None}),
        ("--smiles-column", {"default": "smiles"}),
        ("--id-column", {"default": "id"}),
        ("--ring-size", {"type": int, "default": None}),
        ("--max-rows", {"type": int, "default": None}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> None:
    """Fragment every molecule in a CSV and write the flat-CSV outputs.

    The fragment table is always written to ``--output``. When
    ``--errors-output`` is given, the error records (minus their
    ``"exception"`` entry) are written there as CSV. A JSON run summary is
    written to ``--summary-output`` when given, otherwise printed to stdout.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.
    """
    options = build_parser().parse_args(argv)

    results, errors = _fragment_csv_with_errors(
        input_csv=options.input,
        smiles_column=options.smiles_column,
        id_column=options.id_column,
        ring_size=options.ring_size,
        max_rows=options.max_rows,
    )

    fragments = results_to_dataframe(results)
    fragments_path = Path(options.output)
    fragments_path.parent.mkdir(parents=True, exist_ok=True)
    fragments.to_csv(fragments_path, index=False)

    if options.errors_output:
        errors_path = Path(options.errors_output)
        errors_path.parent.mkdir(parents=True, exist_ok=True)
        # The "exception" entry is dropped before the records are tabulated.
        error_rows = [
            dict(item for item in record.items() if item[0] != "exception")
            for record in errors
        ]
        pd.DataFrame(error_rows).to_csv(errors_path, index=False)

    succeeded = len(results)
    failed = len(errors)
    summary = {
        "processed": succeeded + failed,
        "successful": succeeded,
        "failed": failed,
        "fragments": int(len(fragments)),
        "ring_size": options.ring_size,
        "output": str(fragments_path),
    }

    if not options.summary_output:
        print(json.dumps(summary, indent=2, ensure_ascii=False))
        return

    summary_path = Path(options.summary_output)
    summary_path.parent.mkdir(parents=True, exist_ok=True)
    summary_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.argv = ["macro-lactone-toolkit", "fragment", *sys.argv[1:]]
|
||||
main()
|
||||
|
||||
@@ -2,9 +2,8 @@ from __future__ import annotations
|
||||
|
||||
import sys
|
||||
|
||||
from macro_lactone_toolkit.cli import main
|
||||
from scripts.batch_process import main
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.argv = ["macro-lactone-toolkit", "fragment", *sys.argv[1:]]
|
||||
main()
|
||||
main(sys.argv[1:])
|
||||
|
||||
@@ -2,9 +2,8 @@ from __future__ import annotations
|
||||
|
||||
import sys
|
||||
|
||||
from macro_lactone_toolkit.cli import main
|
||||
from scripts.batch_process import main
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.argv = ["macro-lactone-toolkit", "fragment", "--ring-size", "16", *sys.argv[1:]]
|
||||
main()
|
||||
main(["--ring-size", "16", *sys.argv[1:]])
|
||||
|
||||
@@ -1,11 +1,56 @@
|
||||
from macro_lactone_toolkit import FragmentationResult
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import AllChem
|
||||
|
||||
|
||||
def main() -> None:
|
||||
raise SystemExit(
|
||||
"Legacy helper retired. Use 'macro-lactone-toolkit fragment' to export fragments, "
|
||||
"then generate SDF/statistics in downstream analysis code."
|
||||
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the SDF/statistics script.

    Returns:
        A parser with two mandatory string options: ``--input`` and
        ``--output-dir``.
    """
    cli = argparse.ArgumentParser(
        description="Generate cleavage statistics and ETKDGv3 parent-molecule SDF files from flat fragment CSV."
    )
    # Both options share the same configuration: required, plain string value.
    for flag in ("--input", "--output-dir"):
        cli.add_argument(flag, required=True)
    return cli
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> None:
    """Write cleavage-position statistics and one 3D SDF per parent molecule.

    Reads the flat fragment CSV given by ``--input`` and produces, under
    ``--output-dir``:

    * ``cleavage_position_statistics.json`` - row count per integer
      ``cleavage_position``, total row count, and number of distinct
      ``parent_id`` values.
    * ``sdf/<parent_id>_3d.sdf`` - for each distinct
      ``(parent_id, parent_smiles)`` pair, the hydrogen-added molecule
      embedded with ETKDGv3 parameters and then UFF-optimized.

    Parents whose SMILES is missing or unparsable, or whose embedding fails,
    get no SDF file; they are listed in a warning on stderr so the omission
    is visible.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.
    """
    import sys  # local import: only needed for the skip warning below

    args = build_parser().parse_args(argv)
    dataframe = pd.read_csv(args.input)
    output_dir = Path(args.output_dir)
    sdf_dir = output_dir / "sdf"
    output_dir.mkdir(parents=True, exist_ok=True)
    sdf_dir.mkdir(parents=True, exist_ok=True)

    position_counts = Counter(int(position) for position in dataframe["cleavage_position"])
    stats = {
        "position_counts": dict(sorted(position_counts.items())),
        "total_fragments": int(len(dataframe)),
        "total_parent_molecules": int(dataframe["parent_id"].nunique()),
    }
    (output_dir / "cleavage_position_statistics.json").write_text(
        json.dumps(stats, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )

    skipped: list[str] = []
    parent_rows = dataframe[["parent_id", "parent_smiles"]].drop_duplicates()
    for parent in parent_rows.itertuples(index=False):
        smiles = parent.parent_smiles
        # An empty CSV cell is read as float NaN, which the SMILES parser
        # rejects with an exception; treat it like any other unparsable input.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            skipped.append(f"{parent.parent_id} (invalid SMILES)")
            continue

        mol = Chem.AddHs(mol)
        # A non-zero return value means no conformer was generated.
        if AllChem.EmbedMolecule(mol, AllChem.ETKDGv3()) != 0:
            skipped.append(f"{parent.parent_id} (3D embedding failed)")
            continue
        AllChem.UFFOptimizeMolecule(mol)

        writer = Chem.SDWriter(str(sdf_dir / f"{parent.parent_id}_3d.sdf"))
        try:
            writer.write(mol)
        finally:
            # Always flush and release the file handle, even if write() raises.
            writer.close()

    if skipped:
        print(
            f"Skipped {len(skipped)} parent molecule(s) without SDF output: " + ", ".join(skipped),
            file=sys.stderr,
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user