# Source provenance (repository listing metadata, converted to comments so the
# file remains valid Python):
#   qsar/1d-qsar/cuda/RFE_cuml_permutation.py
#   commit 4cb2d9f56c "add 1dqsar" by mm644706215, 2025-03-03 20:23:09 +08:00
#   218 lines, 8.6 KiB, Python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file: RFE_cuml_permutation.py
@Description: 使用GPU cuML实现特征选择手动实现RFECV并通过置换重要性计算来剔除最不重要的特征
同时通过交叉验证确定合理的随机森林树数量动态扩展直到CV MSE收敛
并将结果保存为图像,最后使用最佳树数进行训练。
@Date: 2025/03/01
@Author: lyzeng修改版
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ---------------------------
# Data loading: read the assay table and keep SMILES + the S. aureus target.
# ---------------------------
data = pd.read_csv("../data_smi.csv", index_col="Entry ID")
target_data = data[["SMILES", "S.aureus ATCC25923"]].rename(
    columns={"S.aureus ATCC25923": "TARGET"}
)
from rdkit import Chem
from rdkit.Chem import Descriptors
def getMolDescriptors(smiles: str, missingVal=None):
    """Compute every RDKit 2D descriptor for a molecule.

    Parameters
    ----------
    smiles : str
        Molecule encoded as a SMILES string.
    missingVal : Any, optional
        Value stored when a descriptor cannot be computed (default None).

    Returns
    -------
    dict
        Mapping of descriptor name -> value (or ``missingVal`` on failure).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Invalid SMILES: MolFromSmiles returns None. Short-circuit instead of
        # letting every single descriptor raise and be swallowed one at a time.
        return {nm: missingVal for nm, _ in Descriptors._descList}
    res = {}
    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)
        except Exception:
            # Individual descriptors can fail on unusual molecules; record the
            # sentinel rather than aborting the whole featurization.
            val = missingVal
        res[nm] = val
    return res
# Build the feature matrix (one descriptor dict per molecule) and the target.
descriptor_rows = [getMolDescriptors(smi) for smi in target_data['SMILES']]
X_df = pd.DataFrame(descriptor_rows)
y = target_data['TARGET'].values
# Drop uninformative columns: a feature with a single unique value carries
# no signal for the model.
constant_cols = [c for c in X_df.columns if X_df[c].nunique() <= 1]
X_df = X_df.drop(columns=constant_cols)
X = X_df.values
print(f"训练样本数: {len(y)}, 特征数量: {X.shape[1]}")
# ---------------------------
# Move data to the GPU as CuPy arrays so cuML models can consume it directly.
# ---------------------------
import cupy as cp
X_gpu = cp.asarray(X)  # feature matrix, shape (n_samples, n_features)
y_gpu = cp.asarray(y)  # regression target vector
# ---------------------------
# Helper: mean squared error of a fitted model on GPU-resident data.
# ---------------------------
def model_mse(model, X_data, y_data):
    """Return the MSE of ``model`` on (X_data, y_data) as a host-side value."""
    residuals = y_data - model.predict(X_data)
    return cp.mean(residuals ** 2).get()
# ---------------------------
# Permutation importance computed against GPU-resident data.
# ---------------------------
def permutation_importance_gpu(model, X_data, y_data, n_repeats=3, random_state=0):
    """Permutation importance of each feature for a fitted model.

    importance[j] = mean(MSE over ``n_repeats`` shuffles of column j)
                    - baseline MSE on the unshuffled data.

    Shuffling one feature and measuring the performance drop estimates how
    much the model relies on that feature.

    Parameters
    ----------
    model : fitted cuML regressor with a ``predict`` method.
    X_data, y_data : cupy arrays of features and targets.
    n_repeats : int
        Number of independent shuffles averaged per feature.
    random_state : int
        Seed for the NumPy RNG (shuffling happens on the CPU).

    Returns
    -------
    np.ndarray of shape (n_features,)
    """
    X_cpu = cp.asnumpy(X_data)  # single host copy; permutations run on CPU
    baseline = model_mse(model, X_data, y_data)
    n_features = X_cpu.shape[1]
    importances = np.zeros(n_features)
    rng = np.random.RandomState(random_state)
    # Reusable buffer: permute only the column under test instead of copying
    # the whole matrix once per repeat per feature.
    X_permuted = X_cpu.copy()
    for j in range(n_features):
        scores = []
        for _ in range(n_repeats):
            X_permuted[:, j] = rng.permutation(X_cpu[:, j])
            scores.append(model_mse(model, cp.asarray(X_permuted), y_data))
        X_permuted[:, j] = X_cpu[:, j]  # restore before moving to next feature
        importances[j] = np.mean(scores) - baseline
    return importances
# ---------------------------
# 修改交叉验证函数:允许传入树的数量
# ---------------------------
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import KFold
def cross_val_score_gpu(X_data, y_data, cv=5, n_estimators=100):
    """Cross-validated MSE of a cuML random forest on the given features.

    Parameters
    ----------
    X_data, y_data : cupy arrays (features / target) kept on the GPU.
    cv : int
        Number of KFold splits (shuffled, seeded for reproducibility).
    n_estimators : int
        Number of trees in the forest.

    Returns
    -------
    float : mean test-fold MSE.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    mse_scores = []
    # KFold only needs the number of rows to produce indices; splitting an
    # index array avoids copying the whole feature matrix back to the CPU.
    row_indices = np.arange(X_data.shape[0])
    for train_index, test_index in kf.split(row_indices):
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_data[train_index, :], y_data[train_index])
        mse_scores.append(model_mse(model, X_data[test_index, :], y_data[test_index]))
    return np.mean(mse_scores)
# ---------------------------
# Dynamically grow the forest size until the CV MSE stops improving.
# ---------------------------
def evaluate_n_estimators_dynamic(X_data, y_data, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000):
    """Sweep n_estimators from ``start`` upward in increments of ``step``.

    Stops as soon as two consecutive CV MSE values differ by less than
    ``threshold``, or once ``max_estimators`` is reached.

    Returns
    -------
    dict : {n_estimators: CV MSE} for every candidate evaluated.
    """
    results = {}
    previous = None
    for candidate in range(start, max_estimators + 1, step):
        current = cross_val_score_gpu(X_data, y_data, cv=cv, n_estimators=candidate)
        results[candidate] = current
        print(f"n_estimators: {candidate}, CV MSE: {current:.4f}")
        if previous is not None and abs(previous - current) < threshold:
            break
        previous = current
    return results
# Run the dynamic sweep over tree counts.
tree_results = evaluate_n_estimators_dynamic(X_gpu, y_gpu, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000)
# Plot CV MSE as a function of n_estimators and save the figure.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(list(tree_results), list(tree_results.values()), marker='o')
ax.set_xlabel('Random Forest 树数量 (n_estimators)')
ax.set_ylabel('CV MSE')
ax.set_title('不同树数量下的交叉验证MSE')
ax.grid(True)
fig.savefig("tree_vs_cv_mse.png", dpi=300)
plt.close(fig)
# Best tree count = candidate with the lowest CV MSE.
best_n_estimators = min(tree_results, key=tree_results.get)
print(f"最佳随机森林树数量确定为: {best_n_estimators}")
# ---------------------------
# Manual RFECV driven by permutation importance; n_estimators is passed in
# so the best tree count found above can be reused for every fit.
# ---------------------------
def manual_rfecv(X_data, y_data, cv=5, n_estimators=100):
    """Manual recursive feature elimination with cross-validation.

    Each iteration:
      - scores the current feature subset by CV MSE,
      - ranks features by permutation importance,
      - removes the least important feature and records its importance,
      - remembers the subset that achieved the lowest CV MSE.

    Returns
    -------
    (best feature indices, best CV MSE, score history, importance history)
    """
    remaining = list(range(X_data.shape[1]))
    best_features = remaining.copy()
    best_score = float('inf')
    scores_history = []
    perm_imp_history = []  # permutation importance of each removed feature
    while remaining:
        subset = X_data[:, remaining]
        current_score = cross_val_score_gpu(subset, y_data, cv=cv, n_estimators=n_estimators)
        scores_history.append((remaining.copy(), current_score))
        print(f"当前特征数: {len(remaining)}, CV MSE: {current_score:.4f}")
        if current_score < best_score:
            best_score, best_features = current_score, remaining.copy()
        if len(remaining) == 1:
            break
        # Fit on the current subset and rank features by permutation importance.
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(subset, y_data)
        imp = permutation_importance_gpu(model, subset, y_data, n_repeats=3, random_state=0)
        drop_pos = int(np.argmin(imp))
        perm_imp_history.append(imp[drop_pos])
        print(f"剔除特征索引: {remaining[drop_pos]},置换重要性: {imp[drop_pos]:.4f}")
        remaining.pop(drop_pos)
    return best_features, best_score, scores_history, perm_imp_history
# Run the manual RFECV with 5-fold CV and the best tree count found above.
cv_folds = 5
best_features_rfecv, best_mse_rfecv, history, perm_imp_history = manual_rfecv(
    X_gpu, y_gpu, cv=cv_folds, n_estimators=best_n_estimators
)
print(f"\n手动RFECV选择了 {len(best_features_rfecv)} 个特征CV MSE: {best_mse_rfecv:.4f}")
selected_feature_names = X_df.columns[best_features_rfecv].tolist()
print("选定特征名称:", selected_feature_names)
# Plot the permutation importance of each removed feature per iteration.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(range(1, len(perm_imp_history) + 1), perm_imp_history, marker='o')
ax.set_xlabel('RFECV 迭代次数')
ax.set_ylabel('被剔除特征的置换重要性')
ax.set_title('RFECV过程中置换重要性变化')
ax.grid(True)
fig.savefig("rfecv_perm_importance.png", dpi=300)
plt.close(fig)
# ---------------------------
# Final model: train on the selected features with the best tree count,
# then report the cross-validated MSE of that configuration.
# ---------------------------
X_selected = X_gpu[:, best_features_rfecv]
final_model = cuRF(n_estimators=best_n_estimators, random_state=0)
final_model.fit(X_selected, y_gpu)
final_cv_mse = cross_val_score_gpu(X_selected, y_gpu, cv=cv_folds, n_estimators=best_n_estimators)
print(f"\n最终模型最佳特征、n_estimators={best_n_estimators}CV MSE: {final_cv_mse:.4f}")