From 3c25bc619a81231f43b5ab50f458a2d58b287ed3 Mon Sep 17 00:00:00 2001 From: mm644706215 Date: Thu, 31 Jul 2025 13:18:32 +0800 Subject: [PATCH] =?UTF-8?q?=E7=BB=A7=E7=BB=AD=E4=B8=AD=E6=96=AD=E4=BB=BB?= =?UTF-8?q?=E5=8A=A1=E7=BB=A7=E7=BB=AD=E8=BF=90=E8=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/gen_sdf_parallel.py | 97 +++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 53 deletions(-) diff --git a/scripts/gen_sdf_parallel.py b/scripts/gen_sdf_parallel.py index b9c0c96..a9e2baf 100644 --- a/scripts/gen_sdf_parallel.py +++ b/scripts/gen_sdf_parallel.py @@ -1,37 +1,3 @@ -""" --------------------------------------------------------------- -并行批量SMILES转3D SDF文件脚本 --------------------------------------------------------------- -本脚本用于从CSV文件中批量读取SMILES分子式,利用RDKit并行生成3D分子构象, -每个分子单独保存为SDF文件,并输出详细日志和失败统计。 - -主要功能: -- 支持通过参数动态指定SMILES和标识符字段名 -- 多进程加速(可设定n_jobs) -- 详细彩色日志输出(rich),失败原因自动分类 -- 成功/失败统计表格 -- 所有失败分子及原因自动记录至failed_smiles.txt - -依赖: - conda install -y -c conda-forge rdkit pandas rich joblib - or pip install pandas rich - -推荐用法示例: - python gen_sdf_parallel.py --csv input.csv --outdir ./sdf_files \ - --smiles_col canonical_smiles --id_col identifier --n_jobs 8 --max_attempts 100 - -参数说明: - --csv 输入的CSV文件路径(必填) - --outdir 输出SDF文件夹,默认 ./sdf_files - --smiles_col SMILES列名,默认 canonical_smiles - --id_col 标识符列名,默认 identifier - --n_jobs 并行进程数,默认4 - --max_attempts 3D构象最大生成尝试次数,默认10 - -脚本作者:lingyuzeng -最后更新:2025-07 --------------------------------------------------------------- -""" from pathlib import Path import pandas as pd from rdkit import Chem @@ -41,9 +7,18 @@ from concurrent.futures import ProcessPoolExecutor, as_completed from rich.console import Console from rich.table import Table from rich.panel import Panel -# rdkit, tqdm, rich,pandas + console = Console() +def is_valid_sdf(sdf_path): + try: + # 尝试读取SDF文件为mol对象 + suppl = Chem.SDMolSupplier(str(sdf_path), sanitize=False) + mols = 
[mol for mol in suppl if mol is not None] + return len(mols) > 0 + except Exception: + return False + def smiles_to_3d_sdf(identifier, smiles, props, sdf_path, max_attempts=10): try: mol = Chem.MolFromSmiles(smiles) @@ -79,29 +54,44 @@ def batch_csv_to_3d_sdf_parallel(csv_path, output_dir, smiles_col, id_col, n_job output_dir.mkdir(parents=True, exist_ok=True) df = pd.read_csv(csv_path, sep=',', dtype=str) tasks = [] + skipped = [] for idx, row in df.iterrows(): smiles = row[smiles_col] identifier = row[id_col] props = row.to_dict() sdf_file = output_dir / f"{identifier}.sdf" + if sdf_file.exists(): + if is_valid_sdf(sdf_file): + # SDF存在且可读,跳过 + skipped.append(identifier) + continue + else: + # SDF存在但不可读,认为损坏,先删除 + try: + sdf_file.unlink() + console.print(f"[red]⚡发现损坏SDF文件 {sdf_file.name},已删除,准备重新生成[/red]") + except Exception as e: + console.print(f"[bold magenta]❗无法删除损坏SDF: {sdf_file.name}, {e}[/]") tasks.append((identifier, smiles, props, sdf_file, max_attempts)) - console.rule(f"[bold green]共 {len(tasks)} 个分子,使用 {n_jobs} 并行进程生成[/]") + + console.rule(f"[bold green]共 {len(df)} 个分子,{len(skipped)} 个已存在且有效,{len(tasks)} 个待处理(使用 {n_jobs} 并行进程)[/]") results = [] - with ProcessPoolExecutor(max_workers=n_jobs) as executor: - future_to_identifier = {executor.submit(smiles_to_3d_sdf_tuple, task): task[0] for task in tasks} - for i, future in enumerate(as_completed(future_to_identifier), 1): - identifier, success, msg = future.result() - results.append((identifier, success, msg)) - if success: - console.print(f"[bold green]✅ [{identifier}] 处理成功。[/][dim]{msg}[/]") - else: - if "SMILES解析失败" in msg: - console.print(f"[bold red]❌ [{identifier}] SMILES解析失败: {msg}[/]") - elif "3D" in msg: - console.print(f"[yellow]⚠️ [{identifier}] 3D构象生成失败: {msg}[/]") + if tasks: + with ProcessPoolExecutor(max_workers=n_jobs) as executor: + future_to_identifier = {executor.submit(smiles_to_3d_sdf_tuple, task): task[0] for task in tasks} + for i, future in 
enumerate(as_completed(future_to_identifier), 1): + identifier, success, msg = future.result() + results.append((identifier, success, msg)) + if success: + console.print(f"[bold green]✅ [{identifier}] 处理成功。[/][dim]{msg}[/]") else: - console.print(f"[magenta]❗ [{identifier}] 其它错误: {msg}[/]") + if "SMILES解析失败" in msg: + console.print(f"[bold red]❌ [{identifier}] SMILES解析失败: {msg}[/]") + elif "3D" in msg: + console.print(f"[yellow]⚠️ [{identifier}] 3D构象生成失败: {msg}[/]") + else: + console.print(f"[magenta]❗ [{identifier}] 其它错误: {msg}[/]") # 分类失败原因 failed = [r for r in results if not r[1]] @@ -115,6 +105,7 @@ def batch_csv_to_3d_sdf_parallel(csv_path, output_dir, smiles_col, id_col, n_job table.add_column("状态", justify="center", style="cyan") table.add_column("数量", justify="center") table.add_row("成功", str(len(succeed))) + table.add_row("已跳过(已存在有效SDF)", str(len(skipped))) table.add_row("SMILES解析失败", str(len(failed_smiles))) table.add_row("3D构象失败", str(len(failed_3d))) table.add_row("其它失败", str(len(failed_other))) @@ -159,7 +150,7 @@ def batch_csv_to_3d_sdf_parallel(csv_path, output_dir, smiles_col, id_col, n_job ) ) else: - console.print(Panel("[bold green]全部分子处理成功![/bold green]", style="green")) + console.print(Panel("[bold green]全部分子处理成功或已跳过![/bold green]", style="green")) if __name__ == '__main__': import argparse @@ -168,7 +159,7 @@ if __name__ == '__main__': parser.add_argument('--outdir', type=str, default='./sdf_files', help='SDF输出文件夹') parser.add_argument('--smiles_col', type=str, default='canonical_smiles', help='SMILES列名') parser.add_argument('--id_col', type=str, default='identifier', help='标识符列名') - parser.add_argument('--n_jobs', type=int, default=-1, help='并行进程数') + parser.add_argument('--n_jobs', type=int, default=4, help='并行进程数') parser.add_argument('--max_attempts', type=int, default=10, help='最大尝试次数') args = parser.parse_args() @@ -176,5 +167,5 @@ if __name__ == '__main__': args.csv, args.outdir, args.smiles_col, args.id_col, n_jobs=args.n_jobs, 
max_attempts=args.max_attempts ) - - # python gen_sdf_parallel.py --csv ./data/coconut_data_info.csv --outdir ./data/sdf_files --n_jobs 8 --max_attempts 100 --smiles_col canonical_smiles --id_col identifier \ No newline at end of file + # Usage example: + # python gen_sdf_parallel.py --csv coconut_data_info.csv --outdir ./sdf_files --n_jobs 8 --max_attempts 10 --smiles_col canonical_smiles --id_col identifier \ No newline at end of file