From c0ead42384238b6381f12e8d15c365dbc7f03efb Mon Sep 17 00:00:00 2001 From: lingyuzeng Date: Wed, 18 Mar 2026 23:56:41 +0800 Subject: [PATCH] feat(toolkit): add classification and migration Implement the standard/non-standard/not-macrolactone classification layer and integrate it into analyzer, fragmenter, and CLI outputs. Port the remaining legacy package capabilities into new visualization and workflow modules, restore batch/statistics/SDF scripts on top of the flat CSV workflow, and update active docs to the new package API. --- docs/SUMMARY.md | 270 +++------------------ docs/project-docs/QUICK_COMMANDS.md | 29 +-- notebooks/README_analyze_ring16.md | 27 +-- pixi.toml | 1 + pyproject.toml | 1 + scripts/README.md | 19 +- scripts/analyze_fragments.py | 70 +++++- scripts/batch_process.py | 63 ++++- scripts/batch_process_multi_rings.py | 5 +- scripts/batch_process_ring16.py | 5 +- scripts/generate_sdf_and_statistics.py | 55 ++++- src/macro_lactone_toolkit/__init__.py | 28 ++- src/macro_lactone_toolkit/_core.py | 130 +++++++++- src/macro_lactone_toolkit/analyzer.py | 118 ++++++++- src/macro_lactone_toolkit/cli.py | 4 +- src/macro_lactone_toolkit/fragmenter.py | 12 +- src/macro_lactone_toolkit/models.py | 20 ++ src/macro_lactone_toolkit/visualization.py | 180 ++++++++++++++ src/macro_lactone_toolkit/workflows.py | 180 ++++++++++++++ tests/helpers.py | 107 +++++++- tests/test_cli.py | 61 ++++- tests/test_detection_and_numbering.py | 105 +++++++- tests/test_scripts_and_docs.py | 149 ++++++++++++ tests/test_visualization_and_workflows.py | 171 +++++++++++++ 24 files changed, 1497 insertions(+), 313 deletions(-) create mode 100644 src/macro_lactone_toolkit/visualization.py create mode 100644 src/macro_lactone_toolkit/workflows.py create mode 100644 tests/test_scripts_and_docs.py create mode 100644 tests/test_visualization_and_workflows.py diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 987c6e7..a3fcb0e 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -1,243 +1,49 @@ -# Macro Split 项目文档总结 +# Macro Split 文档摘要 -本文档汇总了仓库中所有 Markdown 文件的内容摘要。 +当前仓库的正式接口全部集中在 `src/macro_lactone_toolkit/`,核心能力包括: ---- +- `MacroLactoneAnalyzer` + - 分子级分类:`standard_macrolactone` / `non_standard_macrocycle` / `not_macrolactone` + - 12-20 元大环内酯识别 + - 批量统计、DataFrame 分类、动态 SMARTS、基本理化性质 +- `MacrolactoneFragmenter` + - 标准大环内酯编号 + - 侧链裂解 + - flat JSON/CSV 输出 +- `macro_lactone_toolkit.visualization` + - 编号分子 SVG/PNG + - 碎片 SVG/PNG +- `macro_lactone_toolkit.workflows` + - CSV 批量裂解 + - `FragmentationResult` 转 DataFrame + - JSON 导出 + - 编号图片 + 标注 CSV 导出 +- `macro_lactone_toolkit.splicing` + - 通用大环内酯 scaffold 预处理 + - 片段活化和拼接 -## 1. README.md (项目主文档) +推荐起步方式: -**位置**: `/README.md` - -### 项目简介 -Macrolactone Fragmenter 是一个专业的大环内酯(12-20元环)侧链断裂和分析工具。 - -### 主要特性 -- **智能环原子编号** - 支持 12-20 元环,基于内酯结构的固定编号系统 -- **自动侧链断裂** - 智能识别并断裂所有侧链 -- **强大的可视化** - SVG + PNG 输出 -- **多种导出格式** - JSON、CSV、DataFrame -- **批量处理** - 支持 2000+ 分子的大规模分析 - -### 安装方式 -```bash -# 使用 Pixi(推荐) -pixi install && pixi shell - -# 使用 Pip -conda install -c conda-forge rdkit -pip install -e . -``` - -### 基本用法 ```python -from src.macrolactone_fragmenter import MacrolactoneFragmenter -fragmenter = MacrolactoneFragmenter(ring_size=16) -result = fragmenter.process_molecule(smiles, parent_id="mol_001") +from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter +from macro_lactone_toolkit.workflows import fragment_csv, results_to_dataframe + +analyzer = MacroLactoneAnalyzer() +classification = analyzer.classify_macrocycle(smiles) + +fragmenter = MacrolactoneFragmenter() +result = fragmenter.fragment_molecule(smiles, parent_id="mol_001") + +results = fragment_csv("molecules.csv") +fragments_df = results_to_dataframe(results) ``` ---- +推荐脚本工作流: -## 2. CLEANUP_SUMMARY.md (清理总结) - -**位置**: `/CLEANUP_SUMMARY.md` - -### 内容概要 -记录了项目根目录的清理工作: -- **保留的文件**: README.md, DOCUMENTATION_GUIDE.md, QUICK_COMMANDS.md -- **归档的文件**: 14 个历史文档已移至 `archive/` 目录 -- **清理前**: 17 个 MD 文件,约 120KB -- **清理后**: 3 个核心 MD 文件 + 30+ 个文档系统文件 - ---- - -## 3. DOCUMENTATION_GUIDE.md (文档系统指南) - -**位置**: `/DOCUMENTATION_GUIDE.md` - -### 文档系统特性 -- 使用 **MkDocs + Material 主题 + mkdocstrings** 构建 -- 支持中文、深色/浅色模式 -- 自动从代码生成 API 文档 -- 支持数学公式(MathJax) - -### 常用命令 ```bash -# 本地预览 -pixi run mkdocs serve - -# 构建静态网站 -pixi run mkdocs build - -# 部署到 GitHub Pages -pixi run mkdocs gh-deploy +python scripts/batch_process.py --input molecules.csv --output fragments.csv --errors-output errors.csv +python scripts/analyze_fragments.py --input fragments.csv --output-dir analysis +python scripts/generate_sdf_and_statistics.py --input fragments.csv --output-dir sdf_output ``` -### 添加新文档步骤 -1. 在 `docs/` 创建 `.md` 文件 -2. 编辑内容 -3. 在 `mkdocs.yml` 的 `nav` 部分添加链接 -4. 运行预览验证 - ---- - -## 4. IMPLEMENTATION_SUMMARY.md (实现总结) - -**位置**: `/IMPLEMENTATION_SUMMARY.md` - -### MacroLactoneAnalyzer 封装 -新增 `src/macro_lactone_analyzer.py` 模块,提供: - -#### 静态方法 -- `detect_ring_sizes(mol)` - 识别环大小 -- `is_valid_macrolactone(mol, size)` - 验证大环内酯 -- `analyze_smiles(smiles)` - 单分子分析 -- `dynamic_smarts_match(smiles, ring_size)` - 动态 SMARTS 匹配 - -#### 实例方法 -- `get_single_ring_info(smiles)` - 单分子详细信息 -- `analyze_list(smiles_list)` - 批量分析 -- `classify_molecules(df)` - DataFrame 分类 - -### 特性 -- 高复用性、类型安全、详细错误处理 -- 支持 12-20 元环分析 -- 版本号更新至 2.0.0 - ---- - -## 5. QUICK_COMMANDS.md (快速命令参考) - -**位置**: `/QUICK_COMMANDS.md` - -### 文档命令 -```bash -pixi run mkdocs serve # 启动文档服务器 -pixi run mkdocs build # 构建静态文档 -pixi run mkdocs gh-deploy # 部署到 GitHub Pages -``` - -### 安装命令 -```bash -pixi install && pixi shell # Pixi 方式 -pip install -e . # 开发模式 -``` - -### 开发工具 -```bash -pixi run black src/ # 格式化代码 -pixi run flake8 src/ # 检查代码质量 -pixi run pytest # 运行测试 -``` - ---- - -## 6. notebooks/README_analyze_ring16.md (Notebook 说明) - -**位置**: `/notebooks/README_analyze_ring16.md` - -### 文件说明 -- **Notebook**: `analyze_ring16_molecules.ipynb` -- **输入**: `../output/ring16_match_smarts.csv` (307个分子) - -### 分析内容 -1. **分子基本性质**: 分子量、LogP、QED、TPSA 等 -2. **侧链断裂分析**: 使用 MacrolactoneFragmenter 类 -3. **分布图绘制**: 4x4 子图布局,位置 3-16 的分布 - -### 输出文件 -- `ring16_molecular_properties_distribution.png` -- `atom_count_distribution_ring16.png` -- `molecular_weight_distribution_ring16.png` -- `ring16_fragments_analysis.csv` - -### 延伸分析建议 -- LogP/QED/TPSA 分析 -- SAR 分析(如有活性数据) -- 碎片多样性分析 -- 聚类分析 - ---- - -## 7. scripts/README.md (脚本使用说明) - -**位置**: `/scripts/README.md` - -### 脚本列表 - -#### batch_process_ring16.py -- 处理 16 元环分子(1241个) -- 输入: `ring16/temp_filtered_complete.csv` -- 输出: `output/ring16_fragments/` - -#### batch_process_multi_rings.py -- 处理 12-20 元环的所有分子 -- 自动按环大小分类 -- 检测并剔除含多个内酯键的分子 - -### 输出文件格式 -```json -{ - "parent_id": "ring16_mol_0", - "parent_smiles": "...", - "fragments": [ - { - "fragment_smiles": "CC(C)C", - "cleavage_position": 5, - "atom_count": 4, - "molecular_weight": 58.12 - } - ] -} -``` - -### 日志文件 -- `processing_log_*.txt` - 处理过程 -- `error_log_*.txt` - 错误记录 -- `multiple_lactone_log_*.txt` - 多内酯键分子 - ---- - -## 项目结构概览 - -``` -macro_split/ -├── src/ # 核心源代码 -│ ├── macrolactone_fragmenter.py # 高级封装类 -│ ├── macro_lactone_analyzer.py # 环数分析器 -│ ├── ring_numbering.py # 环编号系统 -│ ├── ring_visualization.py # 可视化工具 -│ └── fragment_dataclass.py # 碎片数据类 -├── notebooks/ # Jupyter Notebook 示例 -├── scripts/ # 批量处理脚本 -├── docs/ # 文档目录 -├── tests/ # 单元测试 -├── pyproject.toml # 项目配置 -├── setup.py # 打包脚本 -├── pixi.toml # Pixi 环境配置 -└── mkdocs.yml # 文档配置 -``` - ---- - -## 快速开始 - -1. **安装环境** - ```bash - pixi install && pixi shell - ``` - -2. **测试导入** - ```python - from src.macrolactone_fragmenter import MacrolactoneFragmenter - fragmenter = MacrolactoneFragmenter(ring_size=16) - ``` - -3. **查看文档** - ```bash - pixi run mkdocs serve - # 访问 http://localhost:8000 - ``` - ---- - -*文档生成日期: 2025-01-23* +活动文档和脚本都基于 `macro_lactone_toolkit.*`。历史 notebook `.ipynb` 快照保留作归档参考,但不再作为当前 API 文档。 diff --git a/docs/project-docs/QUICK_COMMANDS.md b/docs/project-docs/QUICK_COMMANDS.md index cd971f7..0a74be0 100644 --- a/docs/project-docs/QUICK_COMMANDS.md +++ b/docs/project-docs/QUICK_COMMANDS.md @@ -33,23 +33,20 @@ pip install . ## 🧪 测试安装 ```python -# 测试导入 -from src.macrolactone_fragmenter import MacrolactoneFragmenter -print("✓ 安装成功!") +from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter +from macro_lactone_toolkit.workflows import fragment_csv, results_to_dataframe -# 快速测试 -fragmenter = MacrolactoneFragmenter(ring_size=16) -print(f"✓ 初始化成功!ring_size={fragmenter.ring_size}") +analyzer = MacroLactoneAnalyzer() +fragmenter = MacrolactoneFragmenter() +print("✓ 安装成功!") ``` ## 📓 运行示例 ```bash -# Jupyter Notebook -pixi run jupyter notebook notebooks/filter_molecules.ipynb - -# 或启动 Jupyter Lab -pixi run jupyter lab +pixi run macro-lactone-toolkit analyze --smiles 'O=C1CCCCCCCCCCCCCCO1' +python scripts/batch_process.py --input molecules.csv --output fragments.csv --errors-output errors.csv +python scripts/analyze_fragments.py --input fragments.csv --output-dir analysis ``` ## 🔍 项目结构 @@ -129,11 +126,10 @@ cat pyproject.toml | grep version | 文件 | 说明 | |------|------| | `README.md` | 项目主文档 | -| `DOCUMENTATION_GUIDE.md` | 文档系统使用指南 | -| `PROJECT_COMPLETION_SUMMARY.md` | 项目完成总结 | -| `QUICK_COMMANDS.md` | 本文件 | +| `docs/SUMMARY.md` | 当前 API 和工作流摘要 | +| `scripts/README.md` | 脚本工作流说明 | +| `src/macro_lactone_toolkit/` | 正式包实现 | | `pyproject.toml` | Python 项目配置 | -| `setup.py` | 打包脚本 | | `mkdocs.yml` | 文档配置 | | `pixi.toml` | Pixi 环境配置 | @@ -145,5 +141,4 @@ cat pyproject.toml | grep version --- -**需要帮助?** 查看 `DOCUMENTATION_GUIDE.md` 或运行 `pixi run mkdocs serve` - +**需要帮助?** 查看 `docs/SUMMARY.md`、`scripts/README.md` 或运行 `pixi run mkdocs serve` diff --git a/notebooks/README_analyze_ring16.md b/notebooks/README_analyze_ring16.md index 8687e97..5c30cc9 100644 --- a/notebooks/README_analyze_ring16.md +++ b/notebooks/README_analyze_ring16.md @@ -102,19 +102,15 @@ notebook的最后一个单元格(Section 9)提供了详细的延伸分析建 notebook中包含了完整的代码示例,可以直接运行或修改。主要功能: ```python -# 1. 计算分子性质 -props = calculate_properties(smiles) +from macro_lactone_toolkit import MacroLactoneAnalyzer +from macro_lactone_toolkit.workflows import fragment_csv, results_to_dataframe -# 2. 批量断裂 -fragmenter = MacrolactoneFragmenter(ring_size=16) -batch_results = fragmenter.process_csv(csv_file) +analyzer = MacroLactoneAnalyzer() +classification = analyzer.classify_macrocycle(smiles, ring_size=16) -# 3. 统计分析 -df_fragments = fragmenter.batch_to_dataframe(batch_results) -position_stats = df_fragments.groupby('cleavage_position').agg(...) - -# 4. 绘图 -sns.histplot(values, kde=True, ax=ax, bins=30) +batch_results = fragment_csv(csv_file, ring_size=16) +df_fragments = results_to_dataframe(batch_results) +position_stats = df_fragments.groupby("cleavage_position").agg(...) ``` ## 关键洞察 @@ -174,10 +170,10 @@ sns.histplot(values, kde=True, ax=ax, bins=30) ## 参考 -- `filter_molecules.ipynb` - 分子过滤和断裂逻辑 -- `test_align_two_molecules.ipynb` - 绘图逻辑参考 -- `src/macrolactone_fragmenter.py` - 封装的断裂器类 -- `src/ring_visualization.py` - 可视化工具 +- `scripts/batch_process_ring16.py` - 16 元环 flat workflow 入口 +- `scripts/analyze_fragments.py` - 位置统计和图表输出 +- `src/macro_lactone_toolkit/fragmenter.py` - 标准大环内酯裂解器 +- `src/macro_lactone_toolkit/visualization.py` - 编号和碎片可视化工具 ## 问题反馈 @@ -187,4 +183,3 @@ sns.histplot(values, kde=True, ax=ax, bins=30) 3. 查看输出目录是否有写入权限 4. 检查CSV文件路径是否正确 - diff --git a/pixi.toml b/pixi.toml index c751e27..c53bd4d 100644 --- a/pixi.toml +++ b/pixi.toml @@ -18,6 +18,7 @@ pytest = ">=8.3,<9" rdkit = ">=2025.9.1,<2026" pandas = ">=2.3.3,<3" numpy = ">=2.3.4,<3" +matplotlib = ">=3.10,<4" [pypi-dependencies] macro_lactone_toolkit = { path = ".", editable = true } diff --git a/pyproject.toml b/pyproject.toml index ba8e662..dfa1b55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ authors = [ { name = "Macro Split Team" }, ] dependencies = [ + "matplotlib>=3.10", "numpy>=1.26", "pandas>=2.2", ] diff --git a/scripts/README.md b/scripts/README.md index 78c4a18..28eb142 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,11 +1,18 @@ # scripts -这些脚本现在都是基于 `macro_lactone_toolkit.*` 的薄封装或迁移提示。 +这些脚本都基于 `macro_lactone_toolkit.*` 的正式包接口,不再依赖旧的 `src.*` 模块。 -- `batch_process.py`: 等价于 `macro-lactone-toolkit fragment` -- `batch_process_ring16.py`: 等价于 `macro-lactone-toolkit fragment --ring-size 16` -- `batch_process_multi_rings.py`: 自动识别模式的批处理封装 -- `analyze_fragments.py`: 等价于 `macro-lactone-toolkit analyze` +- `batch_process.py`: 读取分子 CSV,输出 flat `fragments.csv`、`errors.csv` 和处理摘要 JSON +- `batch_process_ring16.py`: 固定 `--ring-size 16` 的批处理入口 +- `batch_process_multi_rings.py`: 自动识别 12-20 元环的批处理入口 +- `analyze_fragments.py`: 读取 flat fragment CSV,生成位置统计、性质汇总和频率图 +- `generate_sdf_and_statistics.py`: 读取 flat fragment CSV,生成 cleavage 统计 JSON 和 3D SDF - `tylosin_splicer.py`: 使用 `macro_lactone_toolkit.splicing.*` 做简单拼接 -核心实现与正式接口都在 `src/macro_lactone_toolkit/` 中。 +推荐工作流: + +```bash +python scripts/batch_process.py --input molecules.csv --output fragments.csv --errors-output errors.csv +python scripts/analyze_fragments.py --input fragments.csv --output-dir analysis +python scripts/generate_sdf_and_statistics.py --input fragments.csv --output-dir sdf_output +``` diff --git a/scripts/analyze_fragments.py b/scripts/analyze_fragments.py index 4febb62..0070da6 100644 --- a/scripts/analyze_fragments.py +++ b/scripts/analyze_fragments.py @@ -1,10 +1,74 @@ from __future__ import annotations -import sys +import argparse +from pathlib import Path -from macro_lactone_toolkit.cli import main +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt +import pandas as pd + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Analyze flat fragment CSV output and generate reports.") + parser.add_argument("--input", required=True) + parser.add_argument("--output-dir", required=True) + return parser + + +def main(argv: list[str] | None = None) -> None: + args = build_parser().parse_args(argv) + dataframe = pd.read_csv(args.input) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + position_stats = ( + dataframe.groupby("cleavage_position") + .agg( + total_count=("fragment_id", "size"), + unique_fragments=("fragment_smiles_plain", "nunique"), + mean_atom_count=("atom_count", "mean"), + mean_molecular_weight=("molecular_weight", "mean"), + ) + .reset_index() + .sort_values("cleavage_position") + ) + position_stats.to_csv(output_dir / "position_statistics.csv", index=False) + + property_summary = pd.DataFrame( + [ + { + "unique_parents": dataframe["parent_id"].nunique(), + "total_fragments": len(dataframe), + "unique_fragments": dataframe["fragment_smiles_plain"].nunique(), + "mean_atom_count": dataframe["atom_count"].mean(), + "mean_molecular_weight": dataframe["molecular_weight"].mean(), + } + ] + ) + property_summary.to_csv(output_dir / "fragment_property_summary.csv", index=False) + + figure, axis = plt.subplots(figsize=(10, 6)) + axis.bar(position_stats["cleavage_position"], position_stats["total_count"], color="steelblue") + axis.set_xlabel("Cleavage Position") + axis.set_ylabel("Fragment Count") + axis.set_title("Fragment Frequency by Cleavage Position") + axis.grid(axis="y", alpha=0.3) + figure.tight_layout() + figure.savefig(output_dir / "position_frequencies.png", dpi=300, bbox_inches="tight") + plt.close(figure) + + summary_lines = [ + f"Input file: {args.input}", + f"Rows: {len(dataframe)}", + f"Unique parent molecules: {dataframe['parent_id'].nunique()}", + f"Unique fragments: {dataframe['fragment_smiles_plain'].nunique()}", + f"Most frequent cleavage position: {int(position_stats.sort_values('total_count', ascending=False).iloc[0]['cleavage_position'])}", + ] + (output_dir / "analysis_summary.txt").write_text("\n".join(summary_lines) + "\n", encoding="utf-8") if __name__ == "__main__": - sys.argv = ["macro-lactone-toolkit", "analyze", *sys.argv[1:]] main() diff --git a/scripts/batch_process.py b/scripts/batch_process.py index d9f6709..ceb37a7 100644 --- a/scripts/batch_process.py +++ b/scripts/batch_process.py @@ -1,10 +1,67 @@ from __future__ import annotations -import sys +import argparse +import json +from pathlib import Path -from macro_lactone_toolkit.cli import main +import pandas as pd + +from macro_lactone_toolkit.workflows import _fragment_csv_with_errors, results_to_dataframe + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(description="Batch fragment macrolactones into a flat CSV workflow.") + parser.add_argument("--input", required=True) + parser.add_argument("--output", required=True) + parser.add_argument("--errors-output", default=None) + parser.add_argument("--summary-output", default=None) + parser.add_argument("--smiles-column", default="smiles") + parser.add_argument("--id-column", default="id") + parser.add_argument("--ring-size", type=int, default=None) + parser.add_argument("--max-rows", type=int, default=None) + return parser + + +def main(argv: list[str] | None = None) -> None: + args = build_parser().parse_args(argv) + results, errors = _fragment_csv_with_errors( + input_csv=args.input, + smiles_column=args.smiles_column, + id_column=args.id_column, + ring_size=args.ring_size, + max_rows=args.max_rows, + ) + + fragments = results_to_dataframe(results) + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + fragments.to_csv(output_path, index=False) + + if args.errors_output: + errors_output = Path(args.errors_output) + errors_output.parent.mkdir(parents=True, exist_ok=True) + pd.DataFrame( + [ + {key: value for key, value in error.items() if key != "exception"} + for error in errors + ] + ).to_csv(errors_output, index=False) + + summary = { + "processed": len(results) + len(errors), + "successful": len(results), + "failed": len(errors), + "fragments": int(len(fragments)), + "ring_size": args.ring_size, + "output": str(output_path), + } + if args.summary_output: + summary_path = Path(args.summary_output) + summary_path.parent.mkdir(parents=True, exist_ok=True) + summary_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + else: + print(json.dumps(summary, indent=2, ensure_ascii=False)) if __name__ == "__main__": - sys.argv = ["macro-lactone-toolkit", "fragment", *sys.argv[1:]] main() diff --git a/scripts/batch_process_multi_rings.py b/scripts/batch_process_multi_rings.py index d9f6709..9f25d3c 100644 --- a/scripts/batch_process_multi_rings.py +++ b/scripts/batch_process_multi_rings.py @@ -2,9 +2,8 @@ from __future__ import annotations import sys -from macro_lactone_toolkit.cli import main +from scripts.batch_process import main if __name__ == "__main__": - sys.argv = ["macro-lactone-toolkit", "fragment", *sys.argv[1:]] - main() + main(sys.argv[1:]) diff --git a/scripts/batch_process_ring16.py b/scripts/batch_process_ring16.py index 52c9a7c..9a4a282 100644 --- a/scripts/batch_process_ring16.py +++ b/scripts/batch_process_ring16.py @@ -2,9 +2,8 @@ from __future__ import annotations import sys -from macro_lactone_toolkit.cli import main +from scripts.batch_process import main if __name__ == "__main__": - sys.argv = ["macro-lactone-toolkit", "fragment", "--ring-size", "16", *sys.argv[1:]] - main() + main(["--ring-size", "16", *sys.argv[1:]]) diff --git a/scripts/generate_sdf_and_statistics.py b/scripts/generate_sdf_and_statistics.py index 4dc6d6a..e1bb21d 100644 --- a/scripts/generate_sdf_and_statistics.py +++ b/scripts/generate_sdf_and_statistics.py @@ -1,11 +1,56 @@ -from macro_lactone_toolkit import FragmentationResult +from __future__ import annotations + +import argparse +import json +from collections import Counter +from pathlib import Path + +import pandas as pd +from rdkit import Chem +from rdkit.Chem import AllChem -def main() -> None: - raise SystemExit( - "Legacy helper retired. Use 'macro-lactone-toolkit fragment' to export fragments, " - "then generate SDF/statistics in downstream analysis code." +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Generate cleavage statistics and ETKDGv3 parent-molecule SDF files from flat fragment CSV." ) + parser.add_argument("--input", required=True) + parser.add_argument("--output-dir", required=True) + return parser + + +def main(argv: list[str] | None = None) -> None: + args = build_parser().parse_args(argv) + dataframe = pd.read_csv(args.input) + output_dir = Path(args.output_dir) + sdf_dir = output_dir / "sdf" + output_dir.mkdir(parents=True, exist_ok=True) + sdf_dir.mkdir(parents=True, exist_ok=True) + + position_counts = Counter(int(position) for position in dataframe["cleavage_position"]) + stats = { + "position_counts": dict(sorted(position_counts.items())), + "total_fragments": int(len(dataframe)), + "total_parent_molecules": int(dataframe["parent_id"].nunique()), + } + (output_dir / "cleavage_position_statistics.json").write_text( + json.dumps(stats, indent=2, ensure_ascii=False) + "\n", + encoding="utf-8", + ) + + parent_rows = dataframe[["parent_id", "parent_smiles"]].drop_duplicates() + for parent in parent_rows.itertuples(index=False): + mol = Chem.MolFromSmiles(parent.parent_smiles) + if mol is None: + continue + mol = Chem.AddHs(mol) + params = AllChem.ETKDGv3() + if AllChem.EmbedMolecule(mol, params) != 0: + continue + AllChem.UFFOptimizeMolecule(mol) + writer = Chem.SDWriter(str(sdf_dir / f"{parent.parent_id}_3d.sdf")) + writer.write(mol) + writer.close() if __name__ == "__main__": diff --git a/src/macro_lactone_toolkit/__init__.py b/src/macro_lactone_toolkit/__init__.py index 133d893..51389b3 100644 --- a/src/macro_lactone_toolkit/__init__.py +++ b/src/macro_lactone_toolkit/__init__.py @@ -7,19 +7,45 @@ from .errors import ( RingNumberingError, ) from .fragmenter import MacrolactoneFragmenter -from .models import FragmentationResult, RingNumberingResult, SideChainFragment +from .models import ( + FragmentationResult, + MacrocycleClassificationResult, + RingNumberingResult, + SideChainFragment, +) +from .visualization import ( + fragment_svg, + numbered_molecule_svg, + save_fragment_png, + save_numbered_molecule_png, +) +from .workflows import ( + export_numbered_macrolactone_csv, + fragment_csv, + results_to_dataframe, + write_result_json, +) __all__ = [ "AmbiguousMacrolactoneError", "FragmentationError", "FragmentationResult", + "fragment_csv", + "fragment_svg", "MacroLactoneAnalyzer", "MacrolactoneDetectionError", "MacrolactoneError", "MacrolactoneFragmenter", + "MacrocycleClassificationResult", + "numbered_molecule_svg", "RingNumberingError", "RingNumberingResult", + "results_to_dataframe", + "save_fragment_png", + "save_numbered_molecule_png", "SideChainFragment", + "export_numbered_macrolactone_csv", + "write_result_json", ] __version__ = "0.1.0" diff --git a/src/macro_lactone_toolkit/_core.py b/src/macro_lactone_toolkit/_core.py index 7fbee07..b6b10a2 100644 --- a/src/macro_lactone_toolkit/_core.py +++ b/src/macro_lactone_toolkit/_core.py @@ -6,11 +6,21 @@ from typing import Iterable from rdkit import Chem -from .errors import MacrolactoneDetectionError, RingNumberingError -from .models import RingNumberingResult +from .errors import AmbiguousMacrolactoneError, MacrolactoneDetectionError, RingNumberingError +from .models import MacrocycleClassificationResult, RingNumberingResult VALID_RING_SIZES = tuple(range(12, 21)) +REASON_MESSAGES = { + "contains_non_carbon_ring_atoms_outside_positions_1_2": ( + "Ring positions 3..N contain non-carbon atoms." + ), + "multiple_overlapping_macrocycle_candidates": ( + "Overlapping macrolactone candidate rings were detected." + ), + "no_lactone_ring_in_12_to_20_range": "No 12-20 membered lactone ring was detected.", + "requested_ring_size_not_found": "The requested ring size was not detected as a lactone ring.", +} @dataclass(frozen=True) @@ -73,6 +83,62 @@ def find_macrolactone_candidates( ) +def classify_macrolactone( + mol: Chem.Mol, + smiles: str, + ring_size: int | None = None, +) -> MacrocycleClassificationResult: + candidates = find_macrolactone_candidates(mol, ring_size=ring_size) + candidate_ring_sizes = sorted({candidate.ring_size for candidate in candidates}) + + if not candidates: + reason_code = ( + "requested_ring_size_not_found" + if ring_size is not None + else "no_lactone_ring_in_12_to_20_range" + ) + return _build_classification_result( + smiles=smiles, + classification="not_macrolactone", + ring_size=None, + candidate_ring_sizes=[], + reason_codes=[reason_code], + ) + + if _has_overlapping_candidates(candidates): + return _build_classification_result( + smiles=smiles, + classification="non_standard_macrocycle", + ring_size=candidate_ring_sizes[0] if len(candidate_ring_sizes) == 1 else None, + candidate_ring_sizes=candidate_ring_sizes, + reason_codes=["multiple_overlapping_macrocycle_candidates"], + ) + + if len(candidates) > 1 or len(candidate_ring_sizes) > 1: + raise AmbiguousMacrolactoneError( + "Multiple valid macrolactone candidates were detected. Pass ring_size explicitly." + ) + + candidate = candidates[0] + numbering = build_numbering_result(mol, candidate) + if _contains_non_carbon_atoms_outside_positions_1_2(mol, numbering): + return _build_classification_result( + smiles=smiles, + classification="non_standard_macrocycle", + ring_size=candidate.ring_size, + candidate_ring_sizes=candidate_ring_sizes, + reason_codes=["contains_non_carbon_ring_atoms_outside_positions_1_2"], + ) + + return _build_classification_result( + smiles=smiles, + classification="standard_macrolactone", + ring_size=candidate.ring_size, + candidate_ring_sizes=candidate_ring_sizes, + reason_codes=[], + ) + + def build_numbering_result(mol: Chem.Mol, candidate: DetectedMacrolactone) -> RingNumberingResult: ring_atoms = list(candidate.ring_atoms) ring_atom_set = set(ring_atoms) @@ -120,6 +186,66 @@ def build_numbering_result(mol: Chem.Mol, candidate: DetectedMacrolactone) -> Ri ) +def _build_classification_result( + smiles: str, + classification: str, + ring_size: int | None, + candidate_ring_sizes: list[int], + reason_codes: list[str], +) -> MacrocycleClassificationResult: + reason_messages = [REASON_MESSAGES[reason_code] for reason_code in reason_codes] + return MacrocycleClassificationResult( + smiles=smiles, + classification=classification, + ring_size=ring_size, + primary_reason_code=reason_codes[0] if reason_codes else None, + primary_reason_message=reason_messages[0] if reason_messages else None, + all_reason_codes=list(reason_codes), + all_reason_messages=reason_messages, + candidate_ring_sizes=list(candidate_ring_sizes), + ) + + +def _contains_non_carbon_atoms_outside_positions_1_2( + mol: Chem.Mol, + numbering: RingNumberingResult, +) -> bool: + for position in range(3, numbering.ring_size + 1): + atom_idx = numbering.position_to_atom[position] + if mol.GetAtomWithIdx(atom_idx).GetAtomicNum() != 6: + return True + return False + + +def _has_overlapping_candidates(candidates: list[DetectedMacrolactone]) -> bool: + ring_sets = [set(candidate.ring_atoms) for candidate in candidates] + visited: set[int] = set() + + for start_index in range(len(candidates)): + if start_index in visited: + continue + + queue = deque([start_index]) + component_size = 0 + while queue: + candidate_index = queue.popleft() + if candidate_index in visited: + continue + visited.add(candidate_index) + component_size += 1 + + for neighbor_index in range(len(candidates)): + if neighbor_index == candidate_index or neighbor_index in visited: + continue + if ring_sets[candidate_index].intersection(ring_sets[neighbor_index]): + queue.append(neighbor_index) + + if component_size > 1: + return True + + return False + + def collect_side_chain_atoms( mol: Chem.Mol, start_atom_idx: int, diff --git a/src/macro_lactone_toolkit/analyzer.py b/src/macro_lactone_toolkit/analyzer.py index 5158696..136e137 100644 --- a/src/macro_lactone_toolkit/analyzer.py +++ b/src/macro_lactone_toolkit/analyzer.py @@ -1,8 +1,11 @@ from __future__ import annotations +import pandas as pd from rdkit import Chem +from rdkit.Chem import Crippen, Descriptors, Lipinski, QED -from ._core import ensure_mol, find_macrolactone_candidates +from ._core import classify_macrolactone, ensure_mol, find_macrolactone_candidates +from .models import MacrocycleClassificationResult class MacroLactoneAnalyzer: @@ -13,15 +16,108 @@ class MacroLactoneAnalyzer: candidates = find_macrolactone_candidates(mol) return sorted({candidate.ring_size for candidate in candidates}) - def analyze_molecule(self, mol_input: str | Chem.Mol) -> dict: + def classify_macrocycle( + self, + mol_input: str | Chem.Mol, + ring_size: int | None = None, + ) -> MacrocycleClassificationResult: mol, smiles = ensure_mol(mol_input) - candidates = find_macrolactone_candidates(mol) - valid_ring_sizes = sorted({candidate.ring_size for candidate in candidates}) - is_ambiguous = len(valid_ring_sizes) > 1 or len(candidates) > 1 - return { - "smiles": smiles, - "valid_ring_sizes": valid_ring_sizes, - "candidate_count": len(candidates), - "is_ambiguous": is_ambiguous, - "selected_ring_size": valid_ring_sizes[0] if len(valid_ring_sizes) == 1 and len(candidates) == 1 else None, + return classify_macrolactone(mol, smiles=smiles, ring_size=ring_size) + + def analyze_molecule( + self, + mol_input: str | Chem.Mol, + ring_size: int | None = None, + ) -> dict: + return self.classify_macrocycle(mol_input, ring_size=ring_size).to_dict() + + def analyze_many( + self, + smiles_list: list[str], + ring_range: range = range(12, 21), + ) -> dict: + classification_counts = { + "standard_macrolactone": 0, + "non_standard_macrocycle": 0, + "not_macrolactone": 0, + } + ring_size_counts = {ring_size: 0 for ring_size in ring_range} + results: list[dict] = [] + + for smiles in smiles_list: + classification = self.classify_macrocycle(smiles) + classification_counts[classification.classification] += 1 + if ( + classification.classification == "standard_macrolactone" + and classification.ring_size in ring_size_counts + ): + ring_size_counts[classification.ring_size] += 1 + results.append(classification.to_dict()) + + return { + "total": len(smiles_list), + "classification_counts": classification_counts, + "ring_size_counts": ring_size_counts, + "results": results, + } + + def classify_dataframe( + self, + dataframe: pd.DataFrame, + smiles_column: str = "smiles", + id_column: str | None = None, + ring_range: range = range(12, 21), + ) -> tuple[dict[int, pd.DataFrame], pd.DataFrame]: + grouped_rows: dict[int, list[dict]] = {ring_size: [] for ring_size in ring_range} + rejected_rows: list[dict] = [] + + for index, row in dataframe.iterrows(): + base_row = row.to_dict() + if id_column is None and "id" not in base_row: + base_row["id"] = f"row_{index}" + + classification = self.classify_macrocycle(base_row[smiles_column]) + enriched_row = { + **base_row, + **classification.to_dict(), + } + if ( + classification.classification == "standard_macrolactone" + and classification.ring_size in grouped_rows + ): + grouped_rows[classification.ring_size].append(enriched_row) + else: + rejected_rows.append(enriched_row) + + grouped_frames = { + ring_size: pd.DataFrame(rows) + for ring_size, rows in grouped_rows.items() + if rows + } + return grouped_frames, pd.DataFrame(rejected_rows) + + def match_dynamic_smarts(self, smiles: str, ring_size: int) -> list[int] | None: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + return None + query = Chem.MolFromSmarts(f"[r{ring_size}]([#8][#6](=[#8]))") + if query is None: + return None + matches = mol.GetSubstructMatches(query) + return list(matches[0]) if matches else None + + def calculate_properties(self, smiles: str) -> dict[str, float] | None: + mol = Chem.MolFromSmiles(smiles) + if mol is None: + return None + return { + "molecular_weight": Descriptors.MolWt(mol), + "logp": Crippen.MolLogP(mol), + "qed": QED.qed(mol), + "tpsa": Descriptors.TPSA(mol), + "num_atoms": float(mol.GetNumAtoms()), + "num_heavy_atoms": float(mol.GetNumHeavyAtoms()), + "num_h_donors": float(Lipinski.NumHDonors(mol)), + "num_h_acceptors": float(Lipinski.NumHAcceptors(mol)), + "num_rotatable_bonds": float(Lipinski.NumRotatableBonds(mol)), } diff --git a/src/macro_lactone_toolkit/cli.py b/src/macro_lactone_toolkit/cli.py index da6ff1c..da911be 100644 --- a/src/macro_lactone_toolkit/cli.py +++ b/src/macro_lactone_toolkit/cli.py @@ -48,14 +48,14 @@ def build_parser() -> argparse.ArgumentParser: def run_analyze(args: argparse.Namespace) -> None: analyzer = MacroLactoneAnalyzer() if args.smiles: - payload = analyzer.analyze_molecule(args.smiles) + payload = analyzer.analyze_molecule(args.smiles, ring_size=args.ring_size) _write_output(payload, args.output) return rows = _read_csv_rows(args.input, args.smiles_column, args.id_column) payload = [] for row in rows: - analysis = analyzer.analyze_molecule(row["smiles"]) + analysis = analyzer.analyze_molecule(row["smiles"], ring_size=args.ring_size) analysis["parent_id"] = row["parent_id"] payload.append(analysis) _write_output(payload, args.output) diff --git a/src/macro_lactone_toolkit/fragmenter.py b/src/macro_lactone_toolkit/fragmenter.py index 11fb4f8..b8b348a 100644 --- a/src/macro_lactone_toolkit/fragmenter.py +++ b/src/macro_lactone_toolkit/fragmenter.py @@ -101,11 +101,15 @@ class MacrolactoneFragmenter: ) def _select_candidate(self, mol: Chem.Mol): - candidates = find_macrolactone_candidates(mol, ring_size=self.ring_size) - if not candidates: - requested = f"{self.ring_size}-membered " if self.ring_size is not None else "" - raise MacrolactoneDetectionError(f"No valid {requested}macrolactone was detected.") + classification = self.analyzer.classify_macrocycle(mol, ring_size=self.ring_size) + if classification.classification != "standard_macrolactone": + raise MacrolactoneDetectionError( + "Macrolactone rejected: " + f"classification={classification.classification} " + f"primary_reason_code={classification.primary_reason_code}" + ) + candidates = find_macrolactone_candidates(mol, ring_size=self.ring_size) valid_ring_sizes = sorted({candidate.ring_size for candidate in candidates}) if len(candidates) > 1 or len(valid_ring_sizes) > 1: raise AmbiguousMacrolactoneError( diff --git a/src/macro_lactone_toolkit/models.py b/src/macro_lactone_toolkit/models.py index 36d927c..fdcb948 100644 --- a/src/macro_lactone_toolkit/models.py +++ b/src/macro_lactone_toolkit/models.py @@ -1,6 +1,7 @@ from __future__ import annotations from dataclasses import asdict, dataclass, field +from typing import Literal @dataclass(frozen=True) @@ -50,3 +51,22 @@ class FragmentationResult: "fragments": [fragment.to_dict() for fragment in self.fragments], "warnings": list(self.warnings), } + + +@dataclass(frozen=True) +class MacrocycleClassificationResult: + smiles: str + classification: Literal[ + "standard_macrolactone", + "non_standard_macrocycle", + "not_macrolactone", + ] + ring_size: int | None + primary_reason_code: str | None + primary_reason_message: str | None + all_reason_codes: list[str] = field(default_factory=list) + all_reason_messages: list[str] = field(default_factory=list) + candidate_ring_sizes: list[int] = field(default_factory=list) + + def to_dict(self) -> dict: + return asdict(self) diff --git a/src/macro_lactone_toolkit/visualization.py b/src/macro_lactone_toolkit/visualization.py new file mode 100644 index 0000000..6fda8fa --- /dev/null +++ b/src/macro_lactone_toolkit/visualization.py @@ -0,0 +1,180 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Iterable + +from rdkit import Chem +from rdkit.Chem.Draw import rdMolDraw2D + +from ._core import build_numbering_result, ensure_mol, find_macrolactone_candidates +from .errors import AmbiguousMacrolactoneError, MacrolactoneDetectionError + + +def numbered_molecule_svg( + mol_input: str | Chem.Mol, + ring_size: int | None = None, + size: tuple[int, int] = (800, 800), + allowed_ring_atom_types: list[str] | None = None, + show_atom_labels: bool = True, +) -> str: + mol, _ = ensure_mol(mol_input) + numbering = _get_visualization_numbering( + mol, + ring_size=ring_size, + allowed_ring_atom_types=allowed_ring_atom_types, + ) + drawer = rdMolDraw2D.MolDraw2DSVG(*size) + _draw_numbered_molecule( + mol=mol, + drawer=drawer, + position_to_atom=numbering.position_to_atom, + show_atom_labels=show_atom_labels, + ) + return drawer.GetDrawingText() + + +def save_numbered_molecule_png( + mol_input: str | Chem.Mol, + output_path: str | Path, + ring_size: int | None = None, + size: tuple[int, int] = (800, 800), + allowed_ring_atom_types: list[str] | None = None, + show_atom_labels: bool = True, + dpi: int = 600, +) -> Path: + del dpi + mol, _ = ensure_mol(mol_input) + numbering = _get_visualization_numbering( + mol, + ring_size=ring_size, + allowed_ring_atom_types=allowed_ring_atom_types, + ) + drawer = rdMolDraw2D.MolDraw2DCairo(*size) + _draw_numbered_molecule( + mol=mol, + drawer=drawer, + position_to_atom=numbering.position_to_atom, + show_atom_labels=show_atom_labels, + ) + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(drawer.GetDrawingText()) + return path + + +def fragment_svg(fragment_or_smiles: str | Chem.Mol, size: tuple[int, int] = (400, 400)) -> str: + mol, _ = ensure_mol(fragment_or_smiles) + drawer = rdMolDraw2D.MolDraw2DSVG(*size) + _draw_plain_molecule(mol, drawer) + return drawer.GetDrawingText() + + +def save_fragment_png( + fragment_or_smiles: str | Chem.Mol, + output_path: str | Path, + size: tuple[int, int] = (400, 400), + dpi: int = 600, +) -> Path: + del dpi + mol, _ = ensure_mol(fragment_or_smiles) + drawer = rdMolDraw2D.MolDraw2DCairo(*size) + _draw_plain_molecule(mol, drawer) + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(drawer.GetDrawingText()) + return path + + +def _draw_numbered_molecule( + mol: Chem.Mol, + drawer: rdMolDraw2D.MolDraw2DSVG | rdMolDraw2D.MolDraw2DCairo, + position_to_atom: dict[int, int], + show_atom_labels: bool, +) -> None: + draw_mol = Chem.Mol(mol) + draw_options = drawer.drawOptions() + if show_atom_labels: + for position, atom_idx in position_to_atom.items(): + draw_options.atomLabels[atom_idx] = str(position) + + highlight_atoms = list(position_to_atom.values()) + highlight_colors = { + atom_idx: (0.96, 0.84, 0.48) + for atom_idx in highlight_atoms + } + rdMolDraw2D.PrepareAndDrawMolecule( + drawer, + draw_mol, + highlightAtoms=highlight_atoms, + highlightAtomColors=highlight_colors, + ) + drawer.FinishDrawing() + + +def _draw_plain_molecule( + mol: Chem.Mol, + drawer: rdMolDraw2D.MolDraw2DSVG | rdMolDraw2D.MolDraw2DCairo, +) -> None: + rdMolDraw2D.PrepareAndDrawMolecule(drawer, Chem.Mol(mol)) + drawer.FinishDrawing() + + +def _get_visualization_numbering( + mol: Chem.Mol, + ring_size: int | None, + allowed_ring_atom_types: list[str] | None, +): + candidates = find_macrolactone_candidates(mol, ring_size=ring_size) + if not candidates: + requested = f"{ring_size}-membered " if ring_size is not None else "" + raise MacrolactoneDetectionError(f"No valid {requested}macrolactone was detected.") + + if allowed_ring_atom_types is not None: + allowed_atomic_numbers = _normalize_allowed_ring_atom_types(allowed_ring_atom_types) + candidates = [ + candidate + for candidate in candidates + if _candidate_matches_allowed_ring_atom_types( + mol, + candidate, + allowed_atomic_numbers, + ) + ] + if not candidates: + raise ValueError("No macrolactone candidate matched the allowed ring atom types.") + + valid_ring_sizes = sorted({candidate.ring_size for candidate in candidates}) + if len(candidates) > 1 or len(valid_ring_sizes) > 1: + raise AmbiguousMacrolactoneError( + "Multiple valid macrolactone candidates were detected. Pass ring_size explicitly." + ) + + return build_numbering_result(mol, candidates[0]) + + +def _normalize_allowed_ring_atom_types(atom_types: Iterable[str]) -> set[int]: + periodic_table = Chem.GetPeriodicTable() + normalized: set[int] = set() + for atom_type in atom_types: + if atom_type.isdigit(): + normalized.add(int(atom_type)) + continue + atomic_number = periodic_table.GetAtomicNumber(atom_type.capitalize()) + if atomic_number <= 0: + raise ValueError(f"Unsupported atom type: {atom_type}") + normalized.add(atomic_number) + return normalized + + +def _candidate_matches_allowed_ring_atom_types( + mol: Chem.Mol, + candidate, + allowed_atomic_numbers: set[int], +) -> bool: + numbering = build_numbering_result(mol, candidate) + for position, atom_idx in numbering.position_to_atom.items(): + if position == 2: + continue + if mol.GetAtomWithIdx(atom_idx).GetAtomicNum() not in allowed_atomic_numbers: + return False + return True diff --git a/src/macro_lactone_toolkit/workflows.py b/src/macro_lactone_toolkit/workflows.py new file mode 100644 index 0000000..9955cfd --- /dev/null +++ b/src/macro_lactone_toolkit/workflows.py @@ -0,0 +1,180 @@ +from __future__ import annotations + +import json +from pathlib import Path + +import pandas as pd + +from .analyzer import MacroLactoneAnalyzer +from .errors import MacrolactoneError +from .fragmenter import MacrolactoneFragmenter +from .models import FragmentationResult +from .visualization import save_numbered_molecule_png + + +def fragment_csv( + input_csv: str | Path, + smiles_column: str = "smiles", + id_column: str = "id", + ring_size: int | None = None, + max_rows: int | None = None, +) -> list[FragmentationResult]: + results, errors = _fragment_csv_with_errors( + input_csv=input_csv, + smiles_column=smiles_column, + id_column=id_column, + ring_size=ring_size, + max_rows=max_rows, + ) + if errors: + first_error = errors[0] + raise first_error["exception"] + return results + + +def results_to_dataframe(results: list[FragmentationResult]) -> pd.DataFrame: + rows: list[dict] = [] + for result in results: + for fragment in result.fragments: + rows.append( + { + "parent_id": result.parent_id, + "parent_smiles": result.parent_smiles, + "ring_size": result.ring_size, + **fragment.to_dict(), + } + ) + return pd.DataFrame(rows) + + +def write_result_json(result: FragmentationResult, output_path: str | Path) -> Path: + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(result.to_dict(), indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + return path + + +def export_numbered_macrolactone_csv( + input_csv: str | Path, + output_dir: str | Path, + smiles_column: str = "smiles", + id_column: str = "id", + output_csv_name: str = "numbered_macrolactones.csv", + ring_size: int | None = None, + allowed_ring_atom_types: list[str] | None = None, + image_size: tuple[int, int] = (800, 800), + dpi: int = 600, +) -> Path: + analyzer = MacroLactoneAnalyzer() + output_dir = Path(output_dir) + images_dir = output_dir / "images" + output_dir.mkdir(parents=True, exist_ok=True) + images_dir.mkdir(parents=True, exist_ok=True) + + rows = _read_csv_rows( + input_csv=input_csv, + smiles_column=smiles_column, + id_column=id_column, + ) + export_rows: list[dict] = [] + for row in rows: + record = { + "parent_id": row["parent_id"], + "smiles": row["smiles"], + "status": "success", + "image_path": "", + "classification": None, + "primary_reason_code": None, + "ring_size": None, + "candidate_ring_sizes": [], + "error_type": None, + "error_message": None, + } + try: + classification = analyzer.classify_macrocycle(row["smiles"], ring_size=ring_size) + record.update( + { + "classification": classification.classification, + "primary_reason_code": classification.primary_reason_code, + "ring_size": classification.ring_size, + "candidate_ring_sizes": classification.candidate_ring_sizes, + } + ) + image_path = images_dir / f"{row['parent_id']}.png" + save_numbered_molecule_png( + row["smiles"], + image_path, + ring_size=ring_size, + size=image_size, + allowed_ring_atom_types=allowed_ring_atom_types, + dpi=dpi, + ) + record["image_path"] = str(image_path.relative_to(output_dir.parent)) + except Exception as exc: # pragma: no cover - surfaced in CSV + record.update( + { + "status": "error", + "error_type": type(exc).__name__, + "error_message": str(exc), + } + ) + export_rows.append(record) + + output_path = output_dir / output_csv_name + pd.DataFrame(export_rows).to_csv(output_path, index=False) + return output_path + + +def _fragment_csv_with_errors( + input_csv: str | Path, + smiles_column: str = "smiles", + id_column: str = "id", + ring_size: int | None = None, + max_rows: int | None = None, +) -> tuple[list[FragmentationResult], list[dict]]: + fragmenter = MacrolactoneFragmenter(ring_size=ring_size) + rows = _read_csv_rows( + input_csv=input_csv, + smiles_column=smiles_column, + id_column=id_column, + max_rows=max_rows, + ) + + results: list[FragmentationResult] = [] + errors: list[dict] = [] + for row in rows: + try: + results.append(fragmenter.fragment_molecule(row["smiles"], parent_id=row["parent_id"])) + except MacrolactoneError as exc: + errors.append( + { + "parent_id": row["parent_id"], + "smiles": row["smiles"], + "error_type": type(exc).__name__, + "error_message": str(exc), + "exception": exc, + } + ) + return results, errors + + +def _read_csv_rows( + input_csv: str | Path, + smiles_column: str = "smiles", + id_column: str = "id", + max_rows: int | None = None, +) -> list[dict]: + dataframe = pd.read_csv(input_csv) + if max_rows is not None: + dataframe = dataframe.head(max_rows) + + rows = [] + for index, row in dataframe.iterrows(): + parent_id = row[id_column] if id_column in dataframe.columns else f"row_{index}" + rows.append( + { + "parent_id": str(parent_id), + "smiles": row[smiles_column], + } + ) + return rows diff --git a/tests/helpers.py b/tests/helpers.py index 4efbc75..2391d2e 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -16,11 +16,13 @@ class BuiltMacrolactone: def build_macrolactone( ring_size: int, side_chains: Mapping[int, str] | None = None, + ring_atom_symbols: Mapping[int, str] | None = None, ) -> BuiltMacrolactone: if not 12 <= ring_size <= 20: raise ValueError("ring_size must be between 12 and 20") side_chains = dict(side_chains or {}) + ring_atom_symbols = dict(ring_atom_symbols or {}) rwmol = Chem.RWMol() position_to_atom: dict[int, int] = { @@ -28,7 +30,7 @@ def build_macrolactone( 2: rwmol.AddAtom(Chem.Atom("O")), } for position in range(3, ring_size + 1): - position_to_atom[position] = rwmol.AddAtom(Chem.Atom("C")) + position_to_atom[position] = rwmol.AddAtom(Chem.Atom(ring_atom_symbols.get(position, "C"))) carbonyl_oxygen_idx = rwmol.AddAtom(Chem.Atom("O")) @@ -63,6 +65,109 @@ def build_ambiguous_smiles() -> str: return Chem.MolToSmiles(combined, isomericSmiles=True) +def build_non_standard_ring_atom_macrolactone( + ring_size: int = 16, + hetero_position: int = 5, + atom_symbol: str = "N", +) -> BuiltMacrolactone: + if hetero_position < 3 or hetero_position > ring_size: + raise ValueError("hetero_position must be between 3 and ring_size") + return build_macrolactone( + ring_size=ring_size, + ring_atom_symbols={hetero_position: atom_symbol}, + ) + + +def build_overlapping_candidate_macrolactone() -> BuiltMacrolactone: + rwmol = Chem.RWMol() + + atom_labels = ( + "A1", + "A2", + "S1", + "S2", + "S3", + "S4", + "A5", + "A6", + "A7", + "A8", + "A9", + "A10", + "B1", + "B2", + "B5", + "B6", + "B7", + "B8", + "B9", + "B10", + "AO", + "BO", + ) + atom_symbols = { + "A1": "C", + "A2": "O", + "S1": "C", + "S2": "C", + "S3": "C", + "S4": "C", + "A5": "C", + "A6": "C", + "A7": "C", + "A8": "C", + "A9": "C", + "A10": "C", + "B1": "C", + "B2": "O", + "B5": "C", + "B6": "C", + "B7": "C", + "B8": "C", + "B9": "C", + "B10": "C", + "AO": "O", + "BO": "O", + } + atoms = {label: rwmol.AddAtom(Chem.Atom(atom_symbols[label])) for label in atom_labels} + + for atom_a, atom_b in ( + ("A1", "A2"), + ("A2", "S1"), + ("S1", "S2"), + ("S2", "S3"), + ("S3", "S4"), + ("S4", "A5"), + ("A5", "A6"), + ("A6", "A7"), + ("A7", "A8"), + ("A8", "A9"), + ("A9", "A10"), + ("A10", "A1"), + ("B1", "B2"), + ("B2", "S1"), + ("S4", "B5"), + ("B5", "B6"), + ("B6", "B7"), + ("B7", "B8"), + ("B8", "B9"), + ("B9", "B10"), + ("B10", "B1"), + ): + rwmol.AddBond(atoms[atom_a], atoms[atom_b], Chem.BondType.SINGLE) + + rwmol.AddBond(atoms["A1"], atoms["AO"], Chem.BondType.DOUBLE) + rwmol.AddBond(atoms["B1"], atoms["BO"], Chem.BondType.DOUBLE) + + mol = rwmol.GetMol() + Chem.SanitizeMol(mol) + return BuiltMacrolactone( + mol=mol, + smiles=Chem.MolToSmiles(mol, isomericSmiles=True), + position_to_atom={}, + ) + + def canonicalize(smiles_or_mol: str | Chem.Mol) -> str: if isinstance(smiles_or_mol, Chem.Mol): mol = smiles_or_mol diff --git a/tests/test_cli.py b/tests/test_cli.py index 9b047db..f771d0d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -6,7 +6,12 @@ import sys import pandas as pd -from .helpers import build_ambiguous_smiles, build_macrolactone +from .helpers import ( + build_ambiguous_smiles, + build_macrolactone, + build_non_standard_ring_atom_macrolactone, + build_overlapping_candidate_macrolactone, +) def run_cli(*args: str) -> subprocess.CompletedProcess[str]: @@ -24,7 +29,10 @@ def test_cli_smoke_commands(): analyze = run_cli("analyze", "--smiles", built.smiles) assert analyze.returncode == 0, analyze.stderr analyze_payload = json.loads(analyze.stdout) - assert analyze_payload["valid_ring_sizes"] == [16] + assert analyze_payload["classification"] == "standard_macrolactone" + assert analyze_payload["ring_size"] == 16 + assert analyze_payload["primary_reason_code"] is None + assert analyze_payload["candidate_ring_sizes"] == [16] number = run_cli("number", "--smiles", built.smiles) assert number.returncode == 0, number.stderr @@ -40,6 +48,55 @@ def test_cli_smoke_commands(): assert fragment_payload["fragments"][0]["fragment_smiles_labeled"] +def test_cli_analyze_reports_non_standard_classifications(): + hetero = build_non_standard_ring_atom_macrolactone() + overlap = build_overlapping_candidate_macrolactone() + + hetero_result = run_cli("analyze", "--smiles", hetero.smiles) + assert hetero_result.returncode == 0, hetero_result.stderr + hetero_payload = json.loads(hetero_result.stdout) + assert hetero_payload["classification"] == "non_standard_macrocycle" + assert hetero_payload["primary_reason_code"] == "contains_non_carbon_ring_atoms_outside_positions_1_2" + assert hetero_payload["ring_size"] == 16 + + overlap_result = run_cli("analyze", "--smiles", overlap.smiles) + assert overlap_result.returncode == 0, overlap_result.stderr + overlap_payload = json.loads(overlap_result.stdout) + assert overlap_payload["classification"] == "non_standard_macrocycle" + assert overlap_payload["primary_reason_code"] == "multiple_overlapping_macrocycle_candidates" + assert overlap_payload["ring_size"] == 12 + + +def test_cli_analyze_csv_reports_classification_fields(tmp_path): + valid = build_macrolactone(14) + hetero = build_non_standard_ring_atom_macrolactone() + input_path = tmp_path / "molecules.csv" + output_path = tmp_path / "analysis.csv" + + pd.DataFrame( + [ + {"id": "valid_1", "smiles": valid.smiles}, + {"id": "hetero_1", "smiles": hetero.smiles}, + ] + ).to_csv(input_path, index=False) + + completed = run_cli( + "analyze", + "--input", + str(input_path), + "--output", + str(output_path), + ) + + assert completed.returncode == 0, completed.stderr + + analysis = pd.read_csv(output_path) + assert set(analysis["parent_id"]) == {"valid_1", "hetero_1"} + assert set(analysis["classification"]) == {"standard_macrolactone", "non_standard_macrocycle"} + assert "primary_reason_code" in analysis.columns + assert "ring_size" in analysis.columns + + def test_cli_fragment_csv_skips_ambiguous_and_records_errors(tmp_path): valid = build_macrolactone(14, {4: "methyl"}) ambiguous = build_ambiguous_smiles() diff --git a/tests/test_detection_and_numbering.py b/tests/test_detection_and_numbering.py index 1cbc4a8..f3b4c4a 100644 --- a/tests/test_detection_and_numbering.py +++ b/tests/test_detection_and_numbering.py @@ -8,7 +8,12 @@ from macro_lactone_toolkit import ( MacrolactoneFragmenter, ) -from .helpers import build_ambiguous_smiles, build_macrolactone +from .helpers import ( + build_ambiguous_smiles, + build_macrolactone, + build_non_standard_ring_atom_macrolactone, + build_overlapping_candidate_macrolactone, +) @pytest.mark.parametrize("ring_size", [12, 14, 16, 20]) @@ -25,6 +30,77 @@ def test_analyzer_rejects_non_lactone_macrocycle(): assert analyzer.get_valid_ring_sizes("C1CCCCCCCCCCC1") == [] +@pytest.mark.parametrize("ring_size", [12, 14, 16, 20]) +def test_analyzer_classifies_supported_ring_sizes(ring_size: int): + built = build_macrolactone(ring_size) + analyzer = MacroLactoneAnalyzer() + + result = analyzer.classify_macrocycle(built.smiles) + + assert result.classification == "standard_macrolactone" + assert result.ring_size == ring_size + assert result.primary_reason_code is None + assert result.primary_reason_message is None + assert result.all_reason_codes == [] + assert result.all_reason_messages == [] + assert result.candidate_ring_sizes == [ring_size] + + +def test_analyzer_classifies_ring_heteroatom_as_non_standard(): + built = build_non_standard_ring_atom_macrolactone() + analyzer = MacroLactoneAnalyzer() + + result = analyzer.classify_macrocycle(built.smiles) + + assert result.classification == "non_standard_macrocycle" + assert result.ring_size == 16 + assert result.primary_reason_code == "contains_non_carbon_ring_atoms_outside_positions_1_2" + assert result.primary_reason_message == "Ring positions 3..N contain non-carbon atoms." + assert result.all_reason_codes == ["contains_non_carbon_ring_atoms_outside_positions_1_2"] + assert result.candidate_ring_sizes == [16] + + +def test_analyzer_classifies_overlapping_candidates_as_non_standard(): + built = build_overlapping_candidate_macrolactone() + analyzer = MacroLactoneAnalyzer() + + result = analyzer.classify_macrocycle(built.smiles) + + assert result.classification == "non_standard_macrocycle" + assert result.ring_size == 12 + assert result.primary_reason_code == "multiple_overlapping_macrocycle_candidates" + assert result.primary_reason_message == "Overlapping macrolactone candidate rings were detected." + assert result.all_reason_codes == ["multiple_overlapping_macrocycle_candidates"] + assert result.candidate_ring_sizes == [12] + + +def test_analyzer_classifies_non_lactone_macrocycle(): + analyzer = MacroLactoneAnalyzer() + + result = analyzer.classify_macrocycle("C1CCCCCCCCCCC1") + + assert result.classification == "not_macrolactone" + assert result.ring_size is None + assert result.primary_reason_code == "no_lactone_ring_in_12_to_20_range" + assert result.primary_reason_message == "No 12-20 membered lactone ring was detected." + assert result.all_reason_codes == ["no_lactone_ring_in_12_to_20_range"] + assert result.candidate_ring_sizes == [] + + +def test_analyzer_explicit_ring_size_miss_returns_requested_ring_not_found(): + built = build_macrolactone(12) + analyzer = MacroLactoneAnalyzer() + + result = analyzer.classify_macrocycle(built.smiles, ring_size=16) + + assert result.classification == "not_macrolactone" + assert result.ring_size is None + assert result.primary_reason_code == "requested_ring_size_not_found" + assert result.primary_reason_message == "The requested ring size was not detected as a lactone ring." + assert result.all_reason_codes == ["requested_ring_size_not_found"] + assert result.candidate_ring_sizes == [] + + def test_fragmenter_auto_numbers_ring_with_expected_positions(): built = build_macrolactone(16, {5: "methyl"}) result = MacrolactoneFragmenter().number_molecule(built.mol) @@ -55,10 +131,35 @@ def test_fragmenter_requires_explicit_ring_size_for_ambiguous_molecule(): def test_fragmenter_raises_for_missing_macrolactone(): - with pytest.raises(MacrolactoneDetectionError): + with pytest.raises( + MacrolactoneDetectionError, + match="classification=not_macrolactone primary_reason_code=no_lactone_ring_in_12_to_20_range", + ): MacrolactoneFragmenter().number_molecule("CCO") +def test_fragmenter_rejects_non_standard_macrocycle_with_reason_code(): + built = build_non_standard_ring_atom_macrolactone() + + with pytest.raises( + MacrolactoneDetectionError, + match="classification=non_standard_macrocycle " + "primary_reason_code=contains_non_carbon_ring_atoms_outside_positions_1_2", + ): + MacrolactoneFragmenter().number_molecule(built.smiles) + + +def test_fragmenter_rejects_non_standard_macrocycle_during_fragmentation(): + built = build_overlapping_candidate_macrolactone() + + with pytest.raises( + MacrolactoneDetectionError, + match="classification=non_standard_macrocycle " + "primary_reason_code=multiple_overlapping_macrocycle_candidates", + ): + MacrolactoneFragmenter().fragment_molecule(built.smiles) + + def test_explicit_ring_size_selects_requested_ring(): built = build_macrolactone(14) result = MacrolactoneFragmenter(ring_size=14).number_molecule(built.smiles) diff --git a/tests/test_scripts_and_docs.py b/tests/test_scripts_and_docs.py new file mode 100644 index 0000000..c863ed0 --- /dev/null +++ b/tests/test_scripts_and_docs.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +import pandas as pd + +from macro_lactone_toolkit import MacrolactoneFragmenter + +from .helpers import build_ambiguous_smiles, build_macrolactone + + +PROJECT_ROOT = Path(__file__).resolve().parents[1] +ACTIVE_TEXT_ASSETS = [ + PROJECT_ROOT / "scripts" / "README.md", + PROJECT_ROOT / "docs" / "SUMMARY.md", + PROJECT_ROOT / "docs" / "project-docs" / "QUICK_COMMANDS.md", + PROJECT_ROOT / "notebooks" / "README_analyze_ring16.md", +] + + +def run_script(script_name: str, *args: str) -> subprocess.CompletedProcess[str]: + return subprocess.run( + [sys.executable, str(PROJECT_ROOT / "scripts" / script_name), *args], + capture_output=True, + text=True, + check=False, + cwd=PROJECT_ROOT, + ) + + +def test_batch_process_script_writes_flat_outputs_and_summary(tmp_path): + valid = build_macrolactone(14, {4: "methyl"}) + ambiguous = build_ambiguous_smiles() + input_path = tmp_path / "molecules.csv" + output_path = tmp_path / "fragments.csv" + errors_path = tmp_path / "errors.csv" + summary_path = tmp_path / "summary.json" + + pd.DataFrame( + [ + {"id": "valid_1", "smiles": valid.smiles}, + {"id": "ambiguous_1", "smiles": ambiguous}, + ] + ).to_csv(input_path, index=False) + + completed = run_script( + "batch_process.py", + "--input", + str(input_path), + "--output", + str(output_path), + "--errors-output", + str(errors_path), + "--summary-output", + str(summary_path), + ) + + assert completed.returncode == 0, completed.stderr + assert output_path.exists() + assert errors_path.exists() + assert summary_path.exists() + + summary = json.loads(summary_path.read_text(encoding="utf-8")) + assert summary["processed"] == 2 + assert summary["successful"] == 1 + assert summary["failed"] == 1 + + +def test_analyze_fragments_script_generates_reports_and_plot(tmp_path): + built = build_macrolactone(16, {5: "methyl", 7: "ethyl"}) + result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="analysis_1") + fragments = pd.DataFrame( + [ + { + "parent_id": result.parent_id, + "parent_smiles": result.parent_smiles, + "ring_size": result.ring_size, + **fragment.to_dict(), + } + for fragment in result.fragments + ] + ) + input_path = tmp_path / "fragments.csv" + output_dir = tmp_path / "analysis" + fragments.to_csv(input_path, index=False) + + completed = run_script( + "analyze_fragments.py", + "--input", + str(input_path), + "--output-dir", + str(output_dir), + ) + + assert completed.returncode == 0, completed.stderr + assert (output_dir / "position_statistics.csv").exists() + assert (output_dir / "fragment_property_summary.csv").exists() + assert (output_dir / "position_frequencies.png").exists() + assert (output_dir / "analysis_summary.txt").exists() + + +def test_generate_sdf_and_statistics_script_generates_artifacts(tmp_path): + built = build_macrolactone(16, {5: "methyl"}) + result = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="sdf_1") + fragments = pd.DataFrame( + [ + { + "parent_id": result.parent_id, + "parent_smiles": result.parent_smiles, + "ring_size": result.ring_size, + **fragment.to_dict(), + } + for fragment in result.fragments + ] + ) + input_path = tmp_path / "fragments.csv" + output_dir = tmp_path / "sdf_output" + fragments.to_csv(input_path, index=False) + + completed = run_script( + "generate_sdf_and_statistics.py", + "--input", + str(input_path), + "--output-dir", + str(output_dir), + ) + + assert completed.returncode == 0, completed.stderr + assert (output_dir / "cleavage_position_statistics.json").exists() + assert (output_dir / "sdf" / "sdf_1_3d.sdf").exists() + + +def test_active_text_assets_do_not_reference_legacy_api(): + forbidden_patterns = [ + "from src.", + "import src.", + "process_csv(", + "batch_to_dataframe(", + "visualize_molecule(", + "save_to_json(", + ] + + for path in ACTIVE_TEXT_ASSETS: + text = path.read_text(encoding="utf-8") + for pattern in forbidden_patterns: + assert pattern not in text, f"{path} still contains legacy reference: {pattern}" diff --git a/tests/test_visualization_and_workflows.py b/tests/test_visualization_and_workflows.py new file mode 100644 index 0000000..9b3eaba --- /dev/null +++ b/tests/test_visualization_and_workflows.py @@ -0,0 +1,171 @@ +from __future__ import annotations + +import json + +import pandas as pd +import pytest + +from macro_lactone_toolkit import MacroLactoneAnalyzer, MacrolactoneFragmenter + +from .helpers import ( + build_ambiguous_smiles, + build_macrolactone, + build_non_standard_ring_atom_macrolactone, +) + + +def test_visualization_exports_numbered_svg_and_png(tmp_path): + from macro_lactone_toolkit.visualization import ( + numbered_molecule_svg, + save_fragment_png, + save_numbered_molecule_png, + ) + + built = build_macrolactone(16, {5: "methyl"}) + fragment = MacrolactoneFragmenter().fragment_molecule(built.smiles, parent_id="viz_1").fragments[0] + + svg = numbered_molecule_svg(built.smiles) + assert " 0 + + fragment_path = tmp_path / "fragment.png" + returned_fragment_path = save_fragment_png(fragment.fragment_smiles_labeled, fragment_path) + assert returned_fragment_path == fragment_path + assert fragment_path.exists() + assert fragment_path.stat().st_size > 0 + + +def test_visualization_supports_allowed_ring_atom_type_filtering(): + from macro_lactone_toolkit.visualization import numbered_molecule_svg + + hetero = build_non_standard_ring_atom_macrolactone() + + svg = numbered_molecule_svg(hetero.smiles, allowed_ring_atom_types=["C", "N"]) + assert "