add train qsar model script
This commit is contained in:
182
pycomsia/src/generate_3d_structures.py
Normal file
182
pycomsia/src/generate_3d_structures.py
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import multiprocessing as mp
|
||||||
|
import pandas as pd
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from rdkit import Chem
|
||||||
|
from rdkit.Chem import AllChem
|
||||||
|
|
||||||
|
def generate_rdkit_sdf(csv_file):
|
||||||
|
"""
|
||||||
|
读取 CSV 中的所有字段,
|
||||||
|
利用 RDKit 生成分子对象(2D,添加氢并嵌入一个初步构象),
|
||||||
|
并保存为 SDF 文件到当前目录下的 “rdkit_output” 文件夹中,
|
||||||
|
文件名为 <Compound>.sdf,同时在分子属性中写入 CSV 中所有信息。
|
||||||
|
|
||||||
|
返回一个列表,每个元素为 (compound, smi, rdkit_sdf_path)。
|
||||||
|
"""
|
||||||
|
data = pd.read_csv(csv_file)
|
||||||
|
required_cols = {"SMILES", "Compound"}
|
||||||
|
if not required_cols.issubset(set(data.columns)):
|
||||||
|
raise ValueError("CSV 文件必须包含 'SMILES' 和 'Compound' 列。")
|
||||||
|
|
||||||
|
rdkit_folder = Path.cwd() / "rdkit_output"
|
||||||
|
rdkit_folder.mkdir(exist_ok=True)
|
||||||
|
|
||||||
|
file_list = []
|
||||||
|
for _, row in data.iterrows():
|
||||||
|
smi = row["SMILES"]
|
||||||
|
compound = str(row["Compound"])
|
||||||
|
mol = Chem.MolFromSmiles(smi)
|
||||||
|
if mol is None:
|
||||||
|
print(f"Warning: 无法从 SMILES {smi} 生成分子对象。")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 遍历 CSV 这一行的所有列,添加到分子属性中
|
||||||
|
for key, value in row.items():
|
||||||
|
mol.SetProp(str(key), str(value))
|
||||||
|
|
||||||
|
# 添加氢并生成初步构象(2D)
|
||||||
|
mol = Chem.AddHs(mol)
|
||||||
|
AllChem.EmbedMolecule(mol)
|
||||||
|
|
||||||
|
out_file = rdkit_folder / f"{compound}.sdf"
|
||||||
|
writer = Chem.SDWriter(str(out_file))
|
||||||
|
writer.write(mol)
|
||||||
|
writer.close()
|
||||||
|
file_list.append((compound, smi, out_file))
|
||||||
|
return file_list
|
||||||
|
|
||||||
|
def fallback_rdkit_generate(compound, smi):
|
||||||
|
"""
|
||||||
|
使用 RDKit 的 ETKDGv3 方法尝试生成 3D 构象,并将生成的构象保存到 balloon_output 文件夹中,
|
||||||
|
文件名为 fallback_<compound>.sdf,同时打印日志。
|
||||||
|
"""
|
||||||
|
mol = Chem.MolFromSmiles(smi)
|
||||||
|
if mol is None:
|
||||||
|
print(f"Warning: fallback 无法从 SMILES {smi} 生成分子对象。")
|
||||||
|
return None
|
||||||
|
# 添加 CSV 中所有属性,这里至少添加 Compound 信息
|
||||||
|
mol.SetProp("Compound", compound)
|
||||||
|
|
||||||
|
mol = Chem.AddHs(mol)
|
||||||
|
params = AllChem.ETKDGv3()
|
||||||
|
params.randomSeed = 10
|
||||||
|
params.useMacrocycleTorsions = True
|
||||||
|
params.maxAttempts = 10000
|
||||||
|
_ = AllChem.EmbedMultipleConfs(mol, numConfs=5, params=params)
|
||||||
|
if mol.GetNumConformers() == 0:
|
||||||
|
print(f"Warning: fallback 分子 {smi} 未生成构象。")
|
||||||
|
return None
|
||||||
|
energies = []
|
||||||
|
for conf_id in range(mol.GetNumConformers()):
|
||||||
|
props = AllChem.MMFFGetMoleculeProperties(mol)
|
||||||
|
ff = AllChem.MMFFGetMoleculeForceField(mol, props, confId=conf_id)
|
||||||
|
ff.Minimize()
|
||||||
|
energies.append(ff.CalcEnergy())
|
||||||
|
min_conf = energies.index(min(energies))
|
||||||
|
conf_ids = [conf.GetId() for conf in mol.GetConformers()]
|
||||||
|
for cid in conf_ids:
|
||||||
|
if cid != min_conf:
|
||||||
|
mol.RemoveConformer(cid)
|
||||||
|
AllChem.MMFFOptimizeMolecule(mol)
|
||||||
|
|
||||||
|
print(f"Fallback: 使用 RDKit 成功生成构象:{compound}")
|
||||||
|
|
||||||
|
# 保存生成的构象到 balloon_output 文件夹
|
||||||
|
balloon_folder = Path.cwd() / "balloon_output"
|
||||||
|
balloon_folder.mkdir(exist_ok=True)
|
||||||
|
output_sdf = balloon_folder / f"fallback_{compound}.sdf"
|
||||||
|
writer = Chem.SDWriter(str(output_sdf))
|
||||||
|
writer.write(mol)
|
||||||
|
writer.close()
|
||||||
|
|
||||||
|
return mol
|
||||||
|
|
||||||
|
def balloon_worker(args):
|
||||||
|
"""
|
||||||
|
多进程 worker:
|
||||||
|
参数 args 为 (compound, smi, input_sdf, balloon_path)。
|
||||||
|
使用 Balloon 对 input_sdf 文件进行 3D 构象生成,输出写入当前目录下的 “balloon_output” 文件夹,
|
||||||
|
文件名为 <compound>.sdf。
|
||||||
|
增加重试机制:如果一次调用失败(即在最多等待 5 秒内输出文件未生成),则重试最多 10 次,
|
||||||
|
每次重试时打印重试日志信息;若10次都失败,则调用 fallback_rdkit_generate() 进行后备生成。
|
||||||
|
返回生成的分子对象(或 None)。
|
||||||
|
"""
|
||||||
|
compound, smi, input_sdf, balloon_path = args
|
||||||
|
balloon_folder = Path.cwd() / "balloon_output"
|
||||||
|
balloon_folder.mkdir(exist_ok=True)
|
||||||
|
output_sdf = balloon_folder / f"{compound}.sdf"
|
||||||
|
|
||||||
|
max_retries = 10 # 重试
|
||||||
|
success = False
|
||||||
|
for attempt in range(1, max_retries + 1):
|
||||||
|
# 调用 Balloon 生成构象
|
||||||
|
cmd = [
|
||||||
|
balloon_path,
|
||||||
|
"--nconfs", "1",
|
||||||
|
"--strict",
|
||||||
|
str(input_sdf),
|
||||||
|
"-o", "sdf",
|
||||||
|
str(output_sdf)
|
||||||
|
]
|
||||||
|
try:
|
||||||
|
subprocess.check_output(cmd, stderr=subprocess.STDOUT, universal_newlines=True)
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
print(f"Error: Balloon 处理 {compound} 第 {attempt} 次出错,输出:\n{e.output}")
|
||||||
|
|
||||||
|
# 最多等待 5 秒检查输出文件是否生成
|
||||||
|
wait_time = 0
|
||||||
|
while wait_time < 5:
|
||||||
|
if output_sdf.exists():
|
||||||
|
success = True
|
||||||
|
break
|
||||||
|
time.sleep(1)
|
||||||
|
wait_time += 1
|
||||||
|
|
||||||
|
if success:
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
print(f"重试 {attempt}: {compound} 的输出文件 {output_sdf} 仍未生成。")
|
||||||
|
|
||||||
|
if not success:
|
||||||
|
print(f"Warning: 尝试 {max_retries} 次后,{compound} 的 Balloon 输出仍不存在,采用 fallback。")
|
||||||
|
return fallback_rdkit_generate(compound, smi)
|
||||||
|
|
||||||
|
with open(str(output_sdf), "r") as f:
|
||||||
|
sdf_content = f.read()
|
||||||
|
mol = Chem.MolFromMolBlock(sdf_content, sanitize=True, removeHs=False)
|
||||||
|
if mol is None:
|
||||||
|
print(f"Warning: Balloon 未能生成有效构象:{compound}")
|
||||||
|
return fallback_rdkit_generate(compound, smi)
|
||||||
|
return mol
|
||||||
|
|
||||||
|
def generate_3d_mols_from_balloon(csv_file, balloon_path):
|
||||||
|
"""
|
||||||
|
主函数:
|
||||||
|
1. 调用 RDKit 生成 SDF 文件,保存在 “rdkit_output” 文件夹中(文件名为 <Compound>.sdf),
|
||||||
|
分子对象中包含 CSV 中所有字段信息,便于溯源。
|
||||||
|
2. 使用多进程调用 Balloon 对每个 SDF 文件生成 3D 构象,输出到 “balloon_output” 文件夹(文件名为 <Compound>.sdf)。
|
||||||
|
若 Balloon 多次重试后仍失败,则使用 RDKit 的 ETKDGv3 生成 3D 构象,并保存 fallback 版本。
|
||||||
|
3. 返回一个列表,每个元素为 (mol, True)。
|
||||||
|
"""
|
||||||
|
rdkit_files = generate_rdkit_sdf(csv_file)
|
||||||
|
if not rdkit_files:
|
||||||
|
print("没有生成任何 RDKit 的 SDF 文件。")
|
||||||
|
return []
|
||||||
|
|
||||||
|
tasks = [(compound, smi, rdkit_sdf, balloon_path) for compound, smi, rdkit_sdf in rdkit_files]
|
||||||
|
with mp.Pool() as pool:
|
||||||
|
results = pool.map(balloon_worker, tasks)
|
||||||
|
|
||||||
|
mols = [mol for mol in results if mol is not None and mol.GetNumConformers() > 0]
|
||||||
|
aligned_results = [(mol, True) for mol in mols]
|
||||||
|
return aligned_results
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
csv_file = "/root/project/qsar/1d-qsar/data_smi.csv" # 替换为实际 CSV 文件路径
|
||||||
|
balloon_path = "/root/project/qsar/pycomsia/src/balloon" # Balloon 二进制程序路径
|
||||||
|
|
||||||
|
aligned_results = generate_3d_mols_from_balloon(csv_file, balloon_path)
|
||||||
|
print(f"共生成 {len(aligned_results)} 个3D构象的分子。")
|
||||||
Reference in New Issue
Block a user