generate_3d_structures

This commit is contained in:
mm644706215
2025-03-11 18:09:30 +08:00
parent e29fc499bf
commit 2041bd4ec3

View File

@@ -0,0 +1,339 @@
#!/usr/bin/env python
"""
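Complete training script example.

Features:
1. Read the SMILES strings and the target variable from a CSV file, compute 2D descriptors (using RDKit and Mordred), and apply SelectKBest feature selection to the descriptors.
2. Generate 3D conformers from the SMILES in the CSV file, then use MolecularGridCalculator and MolecularFieldCalculator to compute 3D-QSAR field features;
   the five fields (steric, electrostatic, hydrophobic, hbond_donor, hbond_acceptor) are flattened and concatenated into a single feature vector.
3. Combine the 2D and 3D features and train random forest and XGBoost regressors, with support for Optuna hyperparameter tuning.
4. The CSV file path and the target variable name are passed on the command line (implemented with click, including detailed help text).

Usage example:
    python main.py --data-smi data_smi.csv --target MIC_LOG_ATCC25923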
"""
import os
import numpy as np
import pandas as pd
import click
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem
from mordred import Calculator, descriptors
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import optuna
# Import the 3D modules (make sure MolecularGridCalculator.py and MolecularFieldCalculator.py are in the same directory).
from MolecularGridCalculator import MolecularGridCalculator
from MolecularFieldCalculator import MolecularFieldCalculator
# ------------------------ 辅助函数 ------------------------
def safe_nunique(series):
    """Count the distinct values in ``series``, treating NaN as a value.

    ``Series.nunique`` hashes every cell, so it fails when cells are
    unhashable (lists, NumPy arrays).  When that happens the sequence cells
    are converted to tuples and the count is retried.

    Args:
        series: pandas Series whose cells may be scalars, lists or arrays.

    Returns:
        The number of distinct values, with missing values counted
        (``dropna=False``).
    """
    try:
        return series.nunique(dropna=False)
    except Exception:
        # A list/array cell is never itself "null", so no pd.isnull() guard is
        # needed here; calling pd.isnull() on a sequence returns an array whose
        # truth value is ambiguous and would raise for multi-element cells.
        hashable = series.apply(
            lambda cell: tuple(cell) if isinstance(cell, (np.ndarray, list)) else cell
        )
        return hashable.nunique(dropna=False)
# ------------------------ 2D描述符计算 ------------------------
def get_rdkit_descriptors(smiles, missingVal=None):
    """Evaluate every descriptor listed in ``Descriptors._descList``.

    Args:
        smiles: SMILES string of the molecule.
        missingVal: Value stored for a descriptor that cannot be computed,
            or for every descriptor when the SMILES cannot be parsed.

    Returns:
        dict mapping descriptor name to its value (or ``missingVal``).
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # Unparseable SMILES: keep the same keys so rows stay aligned.
        return {name: missingVal for name, _ in Descriptors._descList}

    values = {}
    for name, compute in Descriptors._descList:
        try:
            values[name] = compute(molecule)
        except Exception:
            # A failing descriptor must not abort the remaining ones.
            values[name] = missingVal
    return values
# Lazily created Mordred calculator shared by every call; building one
# registers the whole descriptor set, so it is done only once.
_MORDRED_CALCULATOR = None


def _get_mordred_calculator():
    """Return the shared Mordred ``Calculator``, creating it on first use."""
    global _MORDRED_CALCULATOR
    if _MORDRED_CALCULATOR is None:
        _MORDRED_CALCULATOR = Calculator(descriptors)
    return _MORDRED_CALCULATOR


def get_mordred_descriptors(smiles):
    """Compute the Mordred descriptors of one molecule.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        dict as produced by the Mordred result's ``asdict()``.  An empty
        dict is returned when the SMILES cannot be parsed or the
        calculation raises.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return {}
    calc = _get_mordred_calculator()
    try:
        return calc(mol).asdict()
    except Exception:
        # Best effort: a molecule Mordred cannot handle yields an empty row,
        # which the caller fills in during imputation.
        return {}
def _numeric_mean_imputed(frame):
    """Coerce every column to numeric and fill gaps with the column mean.

    Infinite values are treated as missing: left in place they would turn
    the column mean into inf/NaN and later break estimators that reject
    non-finite input.
    """
    numeric = frame.apply(pd.to_numeric, errors='coerce')
    numeric = numeric.replace([np.inf, -np.inf], np.nan)
    return numeric.fillna(numeric.mean(numeric_only=True))


def compute_2d_features(csv_file):
    """Build the 2D descriptor table for every SMILES in ``csv_file``.

    RDKit and Mordred descriptors are computed per row of the ``SMILES``
    column, coerced to numeric, mean-imputed, concatenated, de-duplicated
    by column name and stripped of constant columns.

    Args:
        csv_file: Path of a CSV file containing a ``SMILES`` column.

    Returns:
        pandas DataFrame with one row per CSV row and one column per
        retained descriptor.
    """
    data = pd.read_csv(csv_file)
    smiles = data["SMILES"]

    # RDKit descriptors.
    X_df_rdkit = _numeric_mean_imputed(
        pd.DataFrame(smiles.apply(get_rdkit_descriptors).tolist())
    )

    # Mordred descriptors: drop empty and constant columns before coercion.
    X_df_mordred = pd.DataFrame([get_mordred_descriptors(smi) for smi in smiles])
    X_df_mordred = X_df_mordred.dropna(axis=1, how='all')
    invalid_features = [col for col in X_df_mordred.columns if safe_nunique(X_df_mordred[col]) <= 1]
    if invalid_features:
        print("Mordred 删除恒定特征:", invalid_features)
        X_df_mordred = X_df_mordred.drop(columns=invalid_features)
    X_df_mordred = _numeric_mean_imputed(X_df_mordred)

    # Merge both descriptor sets; on a name clash the first column wins.
    X_df = pd.concat([X_df_rdkit, X_df_mordred], axis=1)
    X_df = X_df.loc[:, ~X_df.columns.duplicated()]
    constant_features = [col for col in X_df.columns if safe_nunique(X_df[col]) <= 1]
    if constant_features:
        print("Combined 删除恒定特征:", constant_features)
        # Non-inplace drop: X_df is a slice of the concatenated frame.
        X_df = X_df.drop(columns=constant_features)

    # Report columns that still hold values which are non-null yet cannot be
    # converted to a number.
    non_numeric_features = {}
    for col in X_df.columns:
        col_numeric = pd.to_numeric(X_df[col], errors='coerce')
        mask = X_df[col].notna() & col_numeric.isna()
        if mask.any():
            non_numeric_features[col] = set(X_df.loc[mask, col].unique())
    if non_numeric_features:
        print("以下特征包含非数值数据(经过 set 去重):")
        for col, uniq in non_numeric_features.items():
            print(f"{col}: {uniq}")
    else:
        print("所有特征均为数值类型。")
    return X_df
def select_2d_features(X_df, y, k=10):
    """Keep the ``k`` best 2D descriptors according to ``f_regression``.

    Args:
        X_df: DataFrame of numeric descriptors (rows are samples).
        y: Target values, one per row of ``X_df``.
        k: Number of features to keep, or ``"all"``.  A value larger than
            the number of available columns is reduced to that number
            instead of letting ``SelectKBest`` raise.

    Returns:
        Tuple ``(X_selected, selected_features)``: the transformed feature
        matrix and the list of retained column names.
    """
    n_features = X_df.shape[1]
    if k != "all" and k > n_features:
        # Small descriptor tables must not abort the whole run.
        k = n_features
    selector = SelectKBest(score_func=f_regression, k=k)
    X_selected = selector.fit_transform(X_df, y)
    selected_features = X_df.columns[selector.get_support()].tolist()
    print("SelectKBest 选中的2D特征", selected_features)
    return X_selected, selected_features
# ------------------------ 3D-QSAR feature computation (3D conformers generated from the SMILES in the CSV) ------------------------
def generate_3d_mols_from_csv(csv_file, mmffVariant='MMFF94'):
    """Generate an MMFF-optimised 3D conformer for each SMILES in a CSV file.

    Each molecule gets explicit hydrogens, is embedded with
    ``AllChem.EmbedMolecule`` (fixed seed, macrocycle torsions enabled) and
    is then optimised with ``AllChem.MMFFOptimizeMolecule``.

    Args:
        csv_file: Path of a CSV file containing a ``SMILES`` column.
        mmffVariant: Force-field variant forwarded to the MMFF optimiser
            (RDKit documents "MMFF94" and "MMFF94s").

    Returns:
        List of ``(mol, True)`` tuples, one per molecule that was embedded
        successfully.  Rows whose SMILES cannot be parsed or embedded are
        skipped with a warning, so the list can be shorter than the CSV;
        callers that combine it with per-row data must account for that.
    """
    data = pd.read_csv(csv_file)
    aligned_results = []
    for smi in data["SMILES"]:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            print(f"Warning: 无法从 SMILES {smi} 生成分子。")
            continue
        m3d = Chem.AddHs(mol)
        # Distance-geometry embedding; leaves the molecule without a
        # conformer when it fails.
        AllChem.EmbedMolecule(m3d, randomSeed=10, useMacrocycleTorsions=True)
        if m3d.GetNumConformers() == 0:
            print(f"Warning: 分子 {smi} 未生成构象。")
            continue
        # Honour the requested force-field variant (it used to be ignored).
        AllChem.MMFFOptimizeMolecule(m3d, mmffVariant=mmffVariant)
        aligned_results.append((m3d, True))
    return aligned_results
def compute_3d_features_with_params_from_csv(csv_file, grid_spacing, padding, alpha):
    """Compute the flattened 3D-QSAR field features for the molecules in a CSV.

    Conformers come from ``generate_3d_mols_from_csv``; the grid is built by
    ``MolecularGridCalculator.generate_grid`` and the fields by
    ``MolecularFieldCalculator.calc_field``.

    Args:
        csv_file: Path of a CSV file containing a ``SMILES`` column.
        grid_spacing: Passed to ``generate_grid`` as ``resolution``.
        padding: Passed to ``generate_grid`` as ``padding``.
        alpha: Assigned to the field calculator's ``ALPHA`` attribute
            (presumably the Gaussian decay parameter; confirm against
            MolecularFieldCalculator).

    Returns:
        ``np.ndarray`` with one row per generated molecule; each row is the
        concatenation of the five field vectors in a fixed order.
    """
    field_order = (
        "steric_field",
        "electrostatic_field",
        "hydrophobic_field",
        "hbond_donor_field",
        "hbond_acceptor_field",
    )

    molecules = generate_3d_mols_from_csv(csv_file)
    grid_calc = MolecularGridCalculator()
    field_calc = MolecularFieldCalculator()
    spacing, dimensions, origin = grid_calc.generate_grid(
        molecules, resolution=grid_spacing, padding=padding
    )
    # Override the calculator's decay parameter before computing the fields.
    field_calc.ALPHA = alpha
    fields_dict = field_calc.calc_field(molecules, spacing, dimensions, origin)

    # One row per molecule: the field vectors chained in field_order.
    rows = [
        [
            value
            for name in field_order
            for value in fields_dict['train_fields'][name][idx]
        ]
        for idx in range(len(molecules))
    ]
    return np.array(rows)
def split_fields_from_X3d(X_3d_all, field_dims):
    """Cut the 3D feature matrix into consecutive column blocks.

    Args:
        X_3d_all: 2D array whose columns hold the concatenated fields.
        field_dims: Width (number of columns) of each field, in order.

    Returns:
        List with one column slice of ``X_3d_all`` per entry of
        ``field_dims``.
    """
    # Running column boundaries: edges[i] .. edges[i + 1] delimits field i.
    edges = [0]
    for width in field_dims:
        edges.append(edges[-1] + width)
    return [X_3d_all[:, lo:hi] for lo, hi in zip(edges, edges[1:])]
# ------------------------ 模型训练与评估 ------------------------
def evaluate_model(X, y, random_state=42):
    """Train default random forest and XGBoost regressors on an 80:20 split.

    Args:
        X: Feature matrix.
        y: Target values.
        random_state: Seed used for the split and for both models.

    Returns:
        dict mapping model name ("RandomForest", "XGBoost") to a dict with
        the hold-out "R2", "RMSE" and "MAE" scores.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    estimators = [
        ("RandomForest", RandomForestRegressor(random_state=random_state)),
        ("XGBoost", xgb.XGBRegressor(random_state=random_state, verbosity=0)),
    ]

    scores = {}
    for label, estimator in estimators:
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        scores[label] = {
            "R2": r2_score(y_test, predictions),
            "RMSE": np.sqrt(mean_squared_error(y_test, predictions)),
            "MAE": mean_absolute_error(y_test, predictions),
        }
    return scores
# ------------------------ Hyperparameter tuning (Optuna) ------------------------
# (csv_file, target, k) -> (y, X_2d_selected).  The 2D descriptors do not
# depend on any trial parameter, so they are computed once per study rather
# than once per trial.
_SELECTED_2D_CACHE = {}


def _cached_2d_selection(csv_file, target, k=10):
    """Return ``(y, X_2d_selected)`` for a CSV/target pair, computed once."""
    key = (csv_file, target, k)
    if key not in _SELECTED_2D_CACHE:
        data = pd.read_csv(csv_file)
        y = data[target].values
        X_df = compute_2d_features(csv_file)
        X_2d_selected, _ = select_2d_features(X_df, y, k=k)
        _SELECTED_2D_CACHE[key] = (y, X_2d_selected)
    return _SELECTED_2D_CACHE[key]


def objective(trial):
    """Optuna objective: validation RMSE of a random forest on 2D + 3D features.

    The trial chooses which of the five 3D fields to use, a weight for each
    selected field, the grid spacing, the ``alpha`` value and the forest's
    ``n_estimators`` / ``max_depth``.  The CSV path and target column are
    read from the active click context (parameters ``data_smi`` and
    ``target``), so this must run inside the ``cli`` command.

    Args:
        trial: The ``optuna`` trial supplying the parameter suggestions.

    Returns:
        RMSE on the 20 % validation split (lower is better).

    Raises:
        ValueError: If the 2D and 3D feature matrices have different row
            counts, which happens when some SMILES could not be turned
            into a 3D conformer.
    """
    field_names = ("steric", "electrostatic", "hydrophobic", "hbond_donor", "hbond_acceptor")

    # Which fields take part, then a weight for each selected field (0 otherwise).
    flags = [trial.suggest_categorical(f"use_{name}", [True, False]) for name in field_names]
    weights = [
        trial.suggest_float(f"weight_{name}", 0.0, 2.0) if flag else 0.0
        for name, flag in zip(field_names, flags)
    ]
    # Grid and decay parameters.
    grid_spacing = trial.suggest_float("grid_spacing", 0.5, 2.0)
    alpha = trial.suggest_float("alpha", 0.1, 1.0)
    # Random forest hyperparameters.
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 3, 15)

    params = click.get_current_context().params
    csv_file = params.get("data_smi")
    target = params.get("target")

    # 3D features depend on the trial's grid parameters, so they are rebuilt.
    X_3d_all = compute_3d_features_with_params_from_csv(
        csv_file, grid_spacing=grid_spacing, padding=3, alpha=alpha
    )
    # Assumes every field flattens to the same width (total width / 5).
    dim_per_field = X_3d_all.shape[1] // len(field_names)
    X_3d_fields = split_fields_from_X3d(X_3d_all, [dim_per_field] * len(field_names))

    selected_fields = [
        field * weight
        for flag, weight, field in zip(flags, weights, X_3d_fields)
        if flag
    ]
    if selected_fields:
        X_3d_selected = np.hstack(selected_fields)
    else:
        # No field selected: keep a single all-zero placeholder column.
        X_3d_selected = np.zeros((X_3d_all.shape[0], 1))

    y, X_2d_selected = _cached_2d_selection(csv_file, target, k=10)

    if X_2d_selected.shape[0] != X_3d_selected.shape[0]:
        raise ValueError(
            "2D features have {} rows but 3D features have {}; some SMILES "
            "produced no 3D conformer, so the matrices cannot be combined."
            .format(X_2d_selected.shape[0], X_3d_selected.shape[0])
        )

    X_combined = np.hstack((X_2d_selected, X_3d_selected))
    X_train, X_val, y_train, y_val = train_test_split(
        X_combined, y, test_size=0.2, random_state=42
    )
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, y_pred))
# ------------------------ Program entry point (click command line) ------------------------
@click.command()
@click.option(
    '--data-smi',
    required=True,
    type=click.Path(exists=True),
    help="包含 SMILES 和目标变量的 CSV 文件路径。",
)
@click.option(
    '--target',
    default="MIC_LOG_ATCC25923",
    help="目标变量列名默认MIC_LOG_ATCC25923。",
)
def cli(data_smi, target):
    """
    通过命令行启动QSAR模型训练和超参数调优流程。
    示例:
    python main.py --data-smi data_smi.csv --target MIC_LOG_ATCC25923
    """
    # NOTE: the docstring above is click's --help text, so it is kept verbatim.
    # Flow: 2D features -> 3D features -> baseline evaluation -> Optuna study.

    # Publish the options on the click context; objective() reads them there.
    ctx = click.get_current_context()
    ctx.params.update(data_smi=data_smi, target=target)

    # 2D descriptors reduced to the 10 best by SelectKBest.
    y = pd.read_csv(data_smi)[target].values
    X_2d_selected, selected_feats = select_2d_features(
        compute_2d_features(data_smi), y, k=10
    )
    click.echo(f"2D特征选择完毕{selected_feats}")

    # 3D field features with fixed default grid parameters.
    X_3d = compute_3d_features_with_params_from_csv(
        data_smi, grid_spacing=1.0, padding=3, alpha=0.3
    )
    click.echo("3D特征计算完毕。")

    # Baseline comparison: 2D only versus 2D + 3D, no tuning.
    X_combined = np.hstack((X_2d_selected, X_3d))
    click.echo("开始初步模型训练评估(不调超参)...")
    results_2d = evaluate_model(X_2d_selected, y)
    results_combined = evaluate_model(X_combined, y)
    click.echo(f"仅2D描述符模型结果{results_2d}")
    click.echo(f"2D + 3D描述符模型结果{results_combined}")

    # Hyperparameter search minimising the objective's RMSE.
    click.echo("开始使用Optuna进行超参数调优...")
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50)
    click.echo("Optuna最佳试验结果")
    click.echo(study.best_trial)
# Run the click command only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    cli()
# Example invocation:
# python main.py --data-smi /root/project/qsar/1d-qsar/data_smi.csv --target MIC_LOG_ATCC25923