继续中断任务继续运行

This commit is contained in:
mm644706215
2025-07-31 13:18:32 +08:00
parent ca7257ffa6
commit 3c25bc619a

View File

@@ -1,37 +1,3 @@
"""
--------------------------------------------------------------
并行批量SMILES转3D SDF文件脚本
--------------------------------------------------------------
本脚本用于从CSV文件中批量读取SMILES分子式利用RDKit并行生成3D分子构象
每个分子单独保存为SDF文件并输出详细日志和失败统计。
主要功能:
- 支持通过参数动态指定SMILES和标识符字段名
- 多进程加速可设定n_jobs
- 详细彩色日志输出rich失败原因自动分类
- 成功/失败统计表格
- 所有失败分子及原因自动记录至failed_smiles.txt
依赖:
conda install -y -c conda-forge rdkit pandas rich joblib
or pip install rdkit pandas rich joblib
推荐用法示例:
python gen_sdf_parallel.py --csv input.csv --outdir ./sdf_files \
--smiles_col canonical_smiles --id_col identifier --n_jobs 8 --max_attempts 100
参数说明:
--csv 输入的CSV文件路径必填
--outdir 输出SDF文件夹默认 ./sdf_files
--smiles_col SMILES列名默认 canonical_smiles
--id_col 标识符列名,默认 identifier
--n_jobs 并行进程数默认4
--max_attempts 3D构象最大生成尝试次数默认10
脚本作者lingyuzeng
最后更新2025-07
--------------------------------------------------------------
"""
from pathlib import Path
import pandas as pd
from rdkit import Chem
@@ -41,9 +7,18 @@ from concurrent.futures import ProcessPoolExecutor, as_completed
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
# rdkit, tqdm, rich,pandas
console = Console()
def is_valid_sdf(sdf_path):
    """Report whether *sdf_path* is a readable SDF file with at least one molecule.

    The file is parsed with sanitization disabled, so chemically dubious but
    structurally readable records still count as valid. Any read or parse
    failure (missing file, corrupt data, etc.) yields False rather than an
    exception — callers use this as a cheap "is this output reusable?" probe.
    """
    try:
        supplier = Chem.SDMolSupplier(str(sdf_path), sanitize=False)
        parsed = [record for record in supplier if record is not None]
    except Exception:
        # Unreadable or corrupt file — treat as invalid, never raise.
        return False
    return bool(parsed)
def smiles_to_3d_sdf(identifier, smiles, props, sdf_path, max_attempts=10):
try:
mol = Chem.MolFromSmiles(smiles)
@@ -79,29 +54,44 @@ def batch_csv_to_3d_sdf_parallel(csv_path, output_dir, smiles_col, id_col, n_job
output_dir.mkdir(parents=True, exist_ok=True)
df = pd.read_csv(csv_path, sep=',', dtype=str)
tasks = []
skipped = []
for idx, row in df.iterrows():
smiles = row[smiles_col]
identifier = row[id_col]
props = row.to_dict()
sdf_file = output_dir / f"{identifier}.sdf"
if sdf_file.exists():
if is_valid_sdf(sdf_file):
# SDF存在且可读跳过
skipped.append(identifier)
continue
else:
# SDF存在但不可读认为损坏先删除
try:
sdf_file.unlink()
console.print(f"[red]⚡发现损坏SDF文件 {sdf_file.name},已删除,准备重新生成[/red]")
except Exception as e:
console.print(f"[bold magenta]❗无法删除损坏SDF: {sdf_file.name}, {e}[/]")
tasks.append((identifier, smiles, props, sdf_file, max_attempts))
console.rule(f"[bold green]共 {len(tasks)} 个分子,使用 {n_jobs} 并行进程生成[/]")
console.rule(f"[bold green]共 {len(df)} 个分子,{len(skipped)} 个已存在且有效,{len(tasks)} 个待处理(使用 {n_jobs} 并行进程)[/]")
results = []
with ProcessPoolExecutor(max_workers=n_jobs) as executor:
future_to_identifier = {executor.submit(smiles_to_3d_sdf_tuple, task): task[0] for task in tasks}
for i, future in enumerate(as_completed(future_to_identifier), 1):
identifier, success, msg = future.result()
results.append((identifier, success, msg))
if success:
console.print(f"[bold green]✅ [{identifier}] 处理成功。[/][dim]{msg}[/]")
else:
if "SMILES解析失败" in msg:
console.print(f"[bold red]❌ [{identifier}] SMILES解析失败: {msg}[/]")
elif "3D" in msg:
console.print(f"[yellow]⚠️ [{identifier}] 3D构象生成失败: {msg}[/]")
if tasks:
with ProcessPoolExecutor(max_workers=n_jobs) as executor:
future_to_identifier = {executor.submit(smiles_to_3d_sdf_tuple, task): task[0] for task in tasks}
for i, future in enumerate(as_completed(future_to_identifier), 1):
identifier, success, msg = future.result()
results.append((identifier, success, msg))
if success:
console.print(f"[bold green]✅ [{identifier}] 处理成功。[/][dim]{msg}[/]")
else:
console.print(f"[magenta]❗ [{identifier}] 其它错误: {msg}[/]")
if "SMILES解析失败" in msg:
console.print(f"[bold red]❌ [{identifier}] SMILES解析失败: {msg}[/]")
elif "3D" in msg:
console.print(f"[yellow]⚠️ [{identifier}] 3D构象生成失败: {msg}[/]")
else:
console.print(f"[magenta]❗ [{identifier}] 其它错误: {msg}[/]")
# 分类失败原因
failed = [r for r in results if not r[1]]
@@ -115,6 +105,7 @@ def batch_csv_to_3d_sdf_parallel(csv_path, output_dir, smiles_col, id_col, n_job
table.add_column("状态", justify="center", style="cyan")
table.add_column("数量", justify="center")
table.add_row("成功", str(len(succeed)))
table.add_row("已跳过已存在有效SDF", str(len(skipped)))
table.add_row("SMILES解析失败", str(len(failed_smiles)))
table.add_row("3D构象失败", str(len(failed_3d)))
table.add_row("其它失败", str(len(failed_other)))
@@ -159,7 +150,7 @@ def batch_csv_to_3d_sdf_parallel(csv_path, output_dir, smiles_col, id_col, n_job
)
)
else:
console.print(Panel("[bold green]全部分子处理成功![/bold green]", style="green"))
console.print(Panel("[bold green]全部分子处理成功或已跳过[/bold green]", style="green"))
if __name__ == '__main__':
import argparse
@@ -168,7 +159,7 @@ if __name__ == '__main__':
parser.add_argument('--outdir', type=str, default='./sdf_files', help='SDF输出文件夹')
parser.add_argument('--smiles_col', type=str, default='canonical_smiles', help='SMILES列名')
parser.add_argument('--id_col', type=str, default='identifier', help='标识符列名')
parser.add_argument('--n_jobs', type=int, default=-1, help='并行进程数')
parser.add_argument('--n_jobs', type=int, default=4, help='并行进程数')
parser.add_argument('--max_attempts', type=int, default=10, help='最大尝试次数')
args = parser.parse_args()
@@ -176,5 +167,5 @@ if __name__ == '__main__':
args.csv, args.outdir, args.smiles_col, args.id_col,
n_jobs=args.n_jobs, max_attempts=args.max_attempts
)
# python gen_sdf_parallel.py --csv ./data/coconut_data_info.csv --outdir ./data/sdf_files --n_jobs 8 --max_attempts 100 --smiles_col canonical_smiles --id_col identifier
# use example:
# python gen_sdf_parallel.py --csv coconut_data_info.csv --outdir ./sdf_files --n_jobs 8 --max_attempts 10 --smiles_col canonical_smiles --id_col identifier