Files
vina_docking_batch/sdf2to3d.py
lingyuzeng 05ce8823f8 first add
2025-08-02 21:54:31 +08:00

96 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
-------------------------------------------------------------
批量2D SDF文件 → 3D SDF文件 并行转换脚本 (joblib版)
-------------------------------------------------------------
功能:
- 从指定目录读取2D SDF文件每个分子一个SDF文件并行批量生成3D SDF文件保留原有属性
- 支持失败分子详细记录和统计
- 源文件与目标3D SDF目录分离防止覆盖
依赖安装:
conda install -y -c conda-forge rdkit joblib
用法示例:
python sdf2to3d.py --src_dir ./2d_sdf_dir --out_dir ./3d_sdf_dir --n_jobs 8
作者chatgpt
-------------------------------------------------------------
"""
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import AllChem
from joblib import Parallel, delayed
import argparse
def _write_mol_to_sdf(mol, out_sdf):
    """Write a single molecule to ``out_sdf``, closing the writer even on error."""
    writer = Chem.SDWriter(str(out_sdf))
    try:
        writer.write(mol)
    finally:
        # Close unconditionally so a failed write never leaks the file handle
        # while the caller moves on to its next attempt.
        writer.close()


def convert_2d_to_3d_sdf(sdf_path, out_dir, max_attempts=10):
    """Build a 3D SDF file from the first readable molecule of a 2D SDF file.

    The molecule gets explicit hydrogens, is embedded with ETKDGv3 parameters,
    optimized with UFF, and written to ``out_dir / f"{sdf_path.stem}.sdf"``.
    Embedding (and everything after it) is retried up to ``max_attempts`` times.

    Args:
        sdf_path: ``pathlib.Path`` of the input SDF file; its stem is used as
            the molecule identifier and as the output file name.
        out_dir: ``pathlib.Path`` of the directory receiving the 3D SDF file.
        max_attempts: maximum number of embed/optimize/write attempts.

    Returns:
        A ``(identifier, ok, message)`` tuple. ``ok`` is True only when a 3D
        file was written. Failures are reported through the tuple; exceptions
        raised during reading or conversion are caught, not propagated.
    """
    identifier = sdf_path.stem
    out_sdf = out_dir / f"{identifier}.sdf"
    try:
        suppl = Chem.SDMolSupplier(str(sdf_path), sanitize=True)
        # Only the first readable record is used, so stop at it instead of
        # parsing every record in the file.
        mol = next((m for m in suppl if m is not None), None)
        if mol is None:
            return identifier, False, "SDF读取失败"

        mol = Chem.AddHs(mol)
        params = AllChem.ETKDGv3()
        last_error = ""
        for attempt in range(max_attempts):
            try:
                status = AllChem.EmbedMolecule(mol, params)
                if status == 0:
                    # Status 0 is treated as a successful embedding.
                    AllChem.UFFOptimizeMolecule(mol)
                    _write_mol_to_sdf(mol, out_sdf)
                    return identifier, True, f"成功(第{attempt+1}次)"
                last_error = f"Embed失败: status={status}"
            except Exception as e:
                # Remember the failure and try again with a fresh embedding.
                last_error = f"3D生成异常: {e}"
        # The fallback text is only reached when no attempt ran at all
        # (max_attempts <= 0); otherwise last_error is always set.
        return identifier, False, last_error or f"3D构象生成失败已重试{max_attempts}次)"
    except Exception as e:
        # Worker boundary: report any failure as a result instead of raising,
        # so one bad file cannot abort the whole parallel batch.
        return identifier, False, f"异常: {e}"
def main(argv=None):
    """Convert every ``*.sdf`` file in ``--src_dir`` to 3D in parallel and report.

    Args:
        argv: optional list of command-line arguments; ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Exits through ``parser.error`` (status 2) when ``--src_dir`` is not an
    existing directory or when ``--src_dir`` and ``--out_dir`` resolve to the
    same directory. Otherwise prints a summary and, if any molecule failed,
    writes ``failed_2dto3d.txt`` (tab-separated identifier and message) into
    the output directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--src_dir', type=str, required=True, help='2D SDF文件夹路径')
    parser.add_argument('--out_dir', type=str, required=True, help='3D SDF输出文件夹路径')
    parser.add_argument('--n_jobs', type=int, default=4, help='并行进程数')
    parser.add_argument('--max_attempts', type=int, default=10, help='最大Embed尝试次数')
    args = parser.parse_args(argv)

    src_dir = Path(args.src_dir)
    out_dir = Path(args.out_dir)

    # Validate before creating anything on disk.
    if not src_dir.is_dir():
        parser.error(f"--src_dir is not an existing directory: {src_dir}")
    # Outputs reuse the input file names, so writing into the source
    # directory would overwrite the 2D inputs.
    if src_dir.resolve() == out_dir.resolve():
        parser.error("--src_dir and --out_dir must be different directories")

    out_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so the processing and report order is deterministic.
    sdf_files = sorted(src_dir.glob("*.sdf"))
    print(f"共检测到2D SDF文件 {len(sdf_files)}开始并行3D生成...")
    results = Parallel(n_jobs=args.n_jobs)(
        delayed(convert_2d_to_3d_sdf)(sdf_file, out_dir, args.max_attempts)
        for sdf_file in sdf_files
    )

    # Split the (identifier, ok, message) results by outcome.
    success = [r for r in results if r[1]]
    failed = [r for r in results if not r[1]]

    print("="*60)
    print(f"总2D SDF: {len(sdf_files)}")
    print(f"成功生成3D SDF: {len(success)}")
    print(f"失败: {len(failed)}")
    if failed:
        print("失败分子列表:")
        for ident, _, msg in failed:
            print(f" {ident}: {msg}")
        # Persist the failures next to the generated files.
        failfile = out_dir / "failed_2dto3d.txt"
        with open(failfile, "w", encoding="utf-8") as f:
            f.writelines(f"{ident}\t{msg}\n" for ident, _, msg in failed)
        print(f"失败分子已保存到: {failfile.resolve()}")
    print("="*60)
# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()