add train qsar model script

This commit is contained in:
mm644706215
2025-03-11 18:25:08 +08:00
parent 2041bd4ec3
commit cbd7b96d54
2 changed files with 182 additions and 0 deletions

View File

@@ -0,0 +1,182 @@
import os
import subprocess
import multiprocessing as mp
import pandas as pd
import time
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import AllChem
def generate_rdkit_sdf(csv_file):
"""
读取 CSV 中的所有字段,
利用 RDKit 生成分子对象2D添加氢并嵌入一个初步构象
并保存为 SDF 文件到当前目录下的 “rdkit_output” 文件夹中,
文件名为 <Compound>.sdf同时在分子属性中写入 CSV 中所有信息。
返回一个列表,每个元素为 (compound, smi, rdkit_sdf_path)。
"""
data = pd.read_csv(csv_file)
required_cols = {"SMILES", "Compound"}
if not required_cols.issubset(set(data.columns)):
raise ValueError("CSV 文件必须包含 'SMILES''Compound' 列。")
rdkit_folder = Path.cwd() / "rdkit_output"
rdkit_folder.mkdir(exist_ok=True)
file_list = []
for _, row in data.iterrows():
smi = row["SMILES"]
compound = str(row["Compound"])
mol = Chem.MolFromSmiles(smi)
if mol is None:
print(f"Warning: 无法从 SMILES {smi} 生成分子对象。")
continue
# 遍历 CSV 这一行的所有列,添加到分子属性中
for key, value in row.items():
mol.SetProp(str(key), str(value))
# 添加氢并生成初步构象2D
mol = Chem.AddHs(mol)
AllChem.EmbedMolecule(mol)
out_file = rdkit_folder / f"{compound}.sdf"
writer = Chem.SDWriter(str(out_file))
writer.write(mol)
writer.close()
file_list.append((compound, smi, out_file))
return file_list
def fallback_rdkit_generate(compound, smi):
"""
使用 RDKit 的 ETKDGv3 方法尝试生成 3D 构象,并将生成的构象保存到 balloon_output 文件夹中,
文件名为 fallback_<compound>.sdf同时打印日志。
"""
mol = Chem.MolFromSmiles(smi)
if mol is None:
print(f"Warning: fallback 无法从 SMILES {smi} 生成分子对象。")
return None
# 添加 CSV 中所有属性,这里至少添加 Compound 信息
mol.SetProp("Compound", compound)
mol = Chem.AddHs(mol)
params = AllChem.ETKDGv3()
params.randomSeed = 10
params.useMacrocycleTorsions = True
params.maxAttempts = 10000
_ = AllChem.EmbedMultipleConfs(mol, numConfs=5, params=params)
if mol.GetNumConformers() == 0:
print(f"Warning: fallback 分子 {smi} 未生成构象。")
return None
energies = []
for conf_id in range(mol.GetNumConformers()):
props = AllChem.MMFFGetMoleculeProperties(mol)
ff = AllChem.MMFFGetMoleculeForceField(mol, props, confId=conf_id)
ff.Minimize()
energies.append(ff.CalcEnergy())
min_conf = energies.index(min(energies))
conf_ids = [conf.GetId() for conf in mol.GetConformers()]
for cid in conf_ids:
if cid != min_conf:
mol.RemoveConformer(cid)
AllChem.MMFFOptimizeMolecule(mol)
print(f"Fallback: 使用 RDKit 成功生成构象:{compound}")
# 保存生成的构象到 balloon_output 文件夹
balloon_folder = Path.cwd() / "balloon_output"
balloon_folder.mkdir(exist_ok=True)
output_sdf = balloon_folder / f"fallback_{compound}.sdf"
writer = Chem.SDWriter(str(output_sdf))
writer.write(mol)
writer.close()
return mol
def balloon_worker(args):
"""
多进程 worker
参数 args 为 (compound, smi, input_sdf, balloon_path)。
使用 Balloon 对 input_sdf 文件进行 3D 构象生成,输出写入当前目录下的 “balloon_output” 文件夹,
文件名为 <compound>.sdf。
增加重试机制:如果一次调用失败(即在最多等待 5 秒内输出文件未生成),则重试最多 10 次,
每次重试时打印重试日志信息若10次都失败则调用 fallback_rdkit_generate() 进行后备生成。
返回生成的分子对象(或 None
"""
compound, smi, input_sdf, balloon_path = args
balloon_folder = Path.cwd() / "balloon_output"
balloon_folder.mkdir(exist_ok=True)
output_sdf = balloon_folder / f"{compound}.sdf"
max_retries = 10 # 重试
success = False
for attempt in range(1, max_retries + 1):
# 调用 Balloon 生成构象
cmd = [
balloon_path,
"--nconfs", "1",
"--strict",
str(input_sdf),
"-o", "sdf",
str(output_sdf)
]
try:
subprocess.check_output(cmd, stderr=subprocess.STDOUT, universal_newlines=True)
except subprocess.CalledProcessError as e:
print(f"Error: Balloon 处理 {compound}{attempt} 次出错,输出:\n{e.output}")
# 最多等待 5 秒检查输出文件是否生成
wait_time = 0
while wait_time < 5:
if output_sdf.exists():
success = True
break
time.sleep(1)
wait_time += 1
if success:
break
else:
print(f"重试 {attempt}: {compound} 的输出文件 {output_sdf} 仍未生成。")
if not success:
print(f"Warning: 尝试 {max_retries} 次后,{compound} 的 Balloon 输出仍不存在,采用 fallback。")
return fallback_rdkit_generate(compound, smi)
with open(str(output_sdf), "r") as f:
sdf_content = f.read()
mol = Chem.MolFromMolBlock(sdf_content, sanitize=True, removeHs=False)
if mol is None:
print(f"Warning: Balloon 未能生成有效构象:{compound}")
return fallback_rdkit_generate(compound, smi)
return mol
def generate_3d_mols_from_balloon(csv_file, balloon_path):
"""
主函数:
1. 调用 RDKit 生成 SDF 文件,保存在 “rdkit_output” 文件夹中(文件名为 <Compound>.sdf
分子对象中包含 CSV 中所有字段信息,便于溯源。
2. 使用多进程调用 Balloon 对每个 SDF 文件生成 3D 构象,输出到 “balloon_output” 文件夹(文件名为 <Compound>.sdf
若 Balloon 多次重试后仍失败,则使用 RDKit 的 ETKDGv3 生成 3D 构象,并保存 fallback 版本。
3. 返回一个列表,每个元素为 (mol, True)。
"""
rdkit_files = generate_rdkit_sdf(csv_file)
if not rdkit_files:
print("没有生成任何 RDKit 的 SDF 文件。")
return []
tasks = [(compound, smi, rdkit_sdf, balloon_path) for compound, smi, rdkit_sdf in rdkit_files]
with mp.Pool() as pool:
results = pool.map(balloon_worker, tasks)
mols = [mol for mol in results if mol is not None and mol.GetNumConformers() > 0]
aligned_results = [(mol, True) for mol in mols]
return aligned_results
if __name__ == "__main__":
csv_file = "/root/project/qsar/1d-qsar/data_smi.csv" # 替换为实际 CSV 文件路径
balloon_path = "/root/project/qsar/pycomsia/src/balloon" # Balloon 二进制程序路径
aligned_results = generate_3d_mols_from_balloon(csv_file, balloon_path)
print(f"共生成 {len(aligned_results)} 个3D构象的分子。")