add train qsar model script
This commit is contained in:
182
pycomsia/src/generate_3d_structures.py
Normal file
182
pycomsia/src/generate_3d_structures.py
Normal file
@@ -0,0 +1,182 @@
|
||||
import os
|
||||
import subprocess
|
||||
import multiprocessing as mp
|
||||
import pandas as pd
|
||||
import time
|
||||
from pathlib import Path
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import AllChem
|
||||
|
||||
def generate_rdkit_sdf(csv_file):
|
||||
"""
|
||||
读取 CSV 中的所有字段,
|
||||
利用 RDKit 生成分子对象(2D,添加氢并嵌入一个初步构象),
|
||||
并保存为 SDF 文件到当前目录下的 “rdkit_output” 文件夹中,
|
||||
文件名为 <Compound>.sdf,同时在分子属性中写入 CSV 中所有信息。
|
||||
|
||||
返回一个列表,每个元素为 (compound, smi, rdkit_sdf_path)。
|
||||
"""
|
||||
data = pd.read_csv(csv_file)
|
||||
required_cols = {"SMILES", "Compound"}
|
||||
if not required_cols.issubset(set(data.columns)):
|
||||
raise ValueError("CSV 文件必须包含 'SMILES' 和 'Compound' 列。")
|
||||
|
||||
rdkit_folder = Path.cwd() / "rdkit_output"
|
||||
rdkit_folder.mkdir(exist_ok=True)
|
||||
|
||||
file_list = []
|
||||
for _, row in data.iterrows():
|
||||
smi = row["SMILES"]
|
||||
compound = str(row["Compound"])
|
||||
mol = Chem.MolFromSmiles(smi)
|
||||
if mol is None:
|
||||
print(f"Warning: 无法从 SMILES {smi} 生成分子对象。")
|
||||
continue
|
||||
|
||||
# 遍历 CSV 这一行的所有列,添加到分子属性中
|
||||
for key, value in row.items():
|
||||
mol.SetProp(str(key), str(value))
|
||||
|
||||
# 添加氢并生成初步构象(2D)
|
||||
mol = Chem.AddHs(mol)
|
||||
AllChem.EmbedMolecule(mol)
|
||||
|
||||
out_file = rdkit_folder / f"{compound}.sdf"
|
||||
writer = Chem.SDWriter(str(out_file))
|
||||
writer.write(mol)
|
||||
writer.close()
|
||||
file_list.append((compound, smi, out_file))
|
||||
return file_list
|
||||
|
||||
def fallback_rdkit_generate(compound, smi):
|
||||
"""
|
||||
使用 RDKit 的 ETKDGv3 方法尝试生成 3D 构象,并将生成的构象保存到 balloon_output 文件夹中,
|
||||
文件名为 fallback_<compound>.sdf,同时打印日志。
|
||||
"""
|
||||
mol = Chem.MolFromSmiles(smi)
|
||||
if mol is None:
|
||||
print(f"Warning: fallback 无法从 SMILES {smi} 生成分子对象。")
|
||||
return None
|
||||
# 添加 CSV 中所有属性,这里至少添加 Compound 信息
|
||||
mol.SetProp("Compound", compound)
|
||||
|
||||
mol = Chem.AddHs(mol)
|
||||
params = AllChem.ETKDGv3()
|
||||
params.randomSeed = 10
|
||||
params.useMacrocycleTorsions = True
|
||||
params.maxAttempts = 10000
|
||||
_ = AllChem.EmbedMultipleConfs(mol, numConfs=5, params=params)
|
||||
if mol.GetNumConformers() == 0:
|
||||
print(f"Warning: fallback 分子 {smi} 未生成构象。")
|
||||
return None
|
||||
energies = []
|
||||
for conf_id in range(mol.GetNumConformers()):
|
||||
props = AllChem.MMFFGetMoleculeProperties(mol)
|
||||
ff = AllChem.MMFFGetMoleculeForceField(mol, props, confId=conf_id)
|
||||
ff.Minimize()
|
||||
energies.append(ff.CalcEnergy())
|
||||
min_conf = energies.index(min(energies))
|
||||
conf_ids = [conf.GetId() for conf in mol.GetConformers()]
|
||||
for cid in conf_ids:
|
||||
if cid != min_conf:
|
||||
mol.RemoveConformer(cid)
|
||||
AllChem.MMFFOptimizeMolecule(mol)
|
||||
|
||||
print(f"Fallback: 使用 RDKit 成功生成构象:{compound}")
|
||||
|
||||
# 保存生成的构象到 balloon_output 文件夹
|
||||
balloon_folder = Path.cwd() / "balloon_output"
|
||||
balloon_folder.mkdir(exist_ok=True)
|
||||
output_sdf = balloon_folder / f"fallback_{compound}.sdf"
|
||||
writer = Chem.SDWriter(str(output_sdf))
|
||||
writer.write(mol)
|
||||
writer.close()
|
||||
|
||||
return mol
|
||||
|
||||
def balloon_worker(args):
|
||||
"""
|
||||
多进程 worker:
|
||||
参数 args 为 (compound, smi, input_sdf, balloon_path)。
|
||||
使用 Balloon 对 input_sdf 文件进行 3D 构象生成,输出写入当前目录下的 “balloon_output” 文件夹,
|
||||
文件名为 <compound>.sdf。
|
||||
增加重试机制:如果一次调用失败(即在最多等待 5 秒内输出文件未生成),则重试最多 10 次,
|
||||
每次重试时打印重试日志信息;若10次都失败,则调用 fallback_rdkit_generate() 进行后备生成。
|
||||
返回生成的分子对象(或 None)。
|
||||
"""
|
||||
compound, smi, input_sdf, balloon_path = args
|
||||
balloon_folder = Path.cwd() / "balloon_output"
|
||||
balloon_folder.mkdir(exist_ok=True)
|
||||
output_sdf = balloon_folder / f"{compound}.sdf"
|
||||
|
||||
max_retries = 10 # 重试
|
||||
success = False
|
||||
for attempt in range(1, max_retries + 1):
|
||||
# 调用 Balloon 生成构象
|
||||
cmd = [
|
||||
balloon_path,
|
||||
"--nconfs", "1",
|
||||
"--strict",
|
||||
str(input_sdf),
|
||||
"-o", "sdf",
|
||||
str(output_sdf)
|
||||
]
|
||||
try:
|
||||
subprocess.check_output(cmd, stderr=subprocess.STDOUT, universal_newlines=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error: Balloon 处理 {compound} 第 {attempt} 次出错,输出:\n{e.output}")
|
||||
|
||||
# 最多等待 5 秒检查输出文件是否生成
|
||||
wait_time = 0
|
||||
while wait_time < 5:
|
||||
if output_sdf.exists():
|
||||
success = True
|
||||
break
|
||||
time.sleep(1)
|
||||
wait_time += 1
|
||||
|
||||
if success:
|
||||
break
|
||||
else:
|
||||
print(f"重试 {attempt}: {compound} 的输出文件 {output_sdf} 仍未生成。")
|
||||
|
||||
if not success:
|
||||
print(f"Warning: 尝试 {max_retries} 次后,{compound} 的 Balloon 输出仍不存在,采用 fallback。")
|
||||
return fallback_rdkit_generate(compound, smi)
|
||||
|
||||
with open(str(output_sdf), "r") as f:
|
||||
sdf_content = f.read()
|
||||
mol = Chem.MolFromMolBlock(sdf_content, sanitize=True, removeHs=False)
|
||||
if mol is None:
|
||||
print(f"Warning: Balloon 未能生成有效构象:{compound}")
|
||||
return fallback_rdkit_generate(compound, smi)
|
||||
return mol
|
||||
|
||||
def generate_3d_mols_from_balloon(csv_file, balloon_path):
|
||||
"""
|
||||
主函数:
|
||||
1. 调用 RDKit 生成 SDF 文件,保存在 “rdkit_output” 文件夹中(文件名为 <Compound>.sdf),
|
||||
分子对象中包含 CSV 中所有字段信息,便于溯源。
|
||||
2. 使用多进程调用 Balloon 对每个 SDF 文件生成 3D 构象,输出到 “balloon_output” 文件夹(文件名为 <Compound>.sdf)。
|
||||
若 Balloon 多次重试后仍失败,则使用 RDKit 的 ETKDGv3 生成 3D 构象,并保存 fallback 版本。
|
||||
3. 返回一个列表,每个元素为 (mol, True)。
|
||||
"""
|
||||
rdkit_files = generate_rdkit_sdf(csv_file)
|
||||
if not rdkit_files:
|
||||
print("没有生成任何 RDKit 的 SDF 文件。")
|
||||
return []
|
||||
|
||||
tasks = [(compound, smi, rdkit_sdf, balloon_path) for compound, smi, rdkit_sdf in rdkit_files]
|
||||
with mp.Pool() as pool:
|
||||
results = pool.map(balloon_worker, tasks)
|
||||
|
||||
mols = [mol for mol in results if mol is not None and mol.GetNumConformers() > 0]
|
||||
aligned_results = [(mol, True) for mol in mols]
|
||||
return aligned_results
|
||||
|
||||
if __name__ == "__main__":
|
||||
csv_file = "/root/project/qsar/1d-qsar/data_smi.csv" # 替换为实际 CSV 文件路径
|
||||
balloon_path = "/root/project/qsar/pycomsia/src/balloon" # Balloon 二进制程序路径
|
||||
|
||||
aligned_results = generate_3d_mols_from_balloon(csv_file, balloon_path)
|
||||
print(f"共生成 {len(aligned_results)} 个3D构象的分子。")
|
||||
Reference in New Issue
Block a user