#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file: RFE_cuml_permutation.py
@Description: GPU feature selection with cuML: a manually implemented RFECV
    that removes the least important feature per iteration using permutation
    importance, after first choosing a reasonable random-forest tree count by
    growing n_estimators until the cross-validated MSE converges. Diagnostic
    plots are saved to disk and the best tree count is used for the final
    training run.
@Date: 2025/03/01
@Author: lyzeng (modified version)
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------
# Data loading and feature construction
# ---------------------------
data = pd.read_csv("../data_smi.csv", index_col="Entry ID")
target_data = data[["SMILES", "S.aureus ATCC25923"]]
target_data.columns = ["SMILES", "TARGET"]

from rdkit import Chem
from rdkit.Chem import Descriptors


def getMolDescriptors(smiles: str, missingVal=None):
    """Compute every RDKit descriptor for one SMILES string.

    Returns a dict {descriptor_name: value}. A descriptor that raises is
    recorded as ``missingVal``; if the SMILES itself cannot be parsed,
    every descriptor is recorded as ``missingVal``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Invalid SMILES: mark every descriptor missing explicitly instead of
        # relying on each descriptor function to raise on a None molecule.
        return {nm: missingVal for nm, _ in Descriptors._descList}
    res = {}
    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)
        except Exception:
            val = missingVal
        res[nm] = val
    return res


# Build the feature matrix and the target vector.
X_df = pd.DataFrame([getMolDescriptors(smi) for smi in target_data['SMILES']])
y = target_data['TARGET'].values

# Drop uninformative features (columns with a single unique value).
invalid_features = [col for col in X_df.columns if X_df[col].nunique() <= 1]
X_df.drop(columns=invalid_features, inplace=True)
X = X_df.values
print(f"训练样本数: {len(y)}, 特征数量: {X.shape[1]}")

# ---------------------------
# Move the data to the GPU (cupy arrays)
# ---------------------------
import cupy as cp

X_gpu = cp.asarray(X)
y_gpu = cp.asarray(y)


# ---------------------------
# Helper: MSE of a fitted model
# ---------------------------
def model_mse(model, X_data, y_data):
    """Return the mean squared error of ``model`` on (X_data, y_data) as a host float."""
    y_pred = model.predict(X_data)
    mse = cp.mean((y_data - y_pred) ** 2)
    return mse.get()


# ---------------------------
# Permutation importance (evaluated on the GPU)
# ---------------------------
def permutation_importance_gpu(model, X_data, y_data, n_repeats=3, random_state=0):
    """
    Permutation importance of each feature for a fitted model:

        importance = mean(MSE with the feature shuffled) - baseline MSE

    Each feature column is shuffled ``n_repeats`` times; the resulting
    performance drop measures how much the model relies on that feature.
    """
    X_cpu = cp.asnumpy(X_data)  # shuffling is done on the CPU
    baseline = model_mse(model, X_data, y_data)
    importances = np.zeros(X_cpu.shape[1])
    rng = np.random.RandomState(random_state)
    # One reusable working copy: only the column under test is modified,
    # instead of copying the entire matrix for every repeat.
    X_permuted = X_cpu.copy()
    for j in range(X_cpu.shape[1]):
        original_col = X_cpu[:, j].copy()
        scores = []
        for _ in range(n_repeats):
            # Reshuffle a copy of the ORIGINAL column each repeat (same RNG
            # draw sequence as shuffling a fresh full-matrix copy).
            shuffled = original_col.copy()
            rng.shuffle(shuffled)
            X_permuted[:, j] = shuffled
            scores.append(model_mse(model, cp.asarray(X_permuted), y_data))
        X_permuted[:, j] = original_col  # restore before the next feature
        importances[j] = np.mean(scores) - baseline
    return importances


# ---------------------------
# Cross-validation with a configurable number of trees
# ---------------------------
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import KFold


def cross_val_score_gpu(X_data, y_data, cv=5, n_estimators=100):
    """
    KFold cross-validation of a cuML random forest on the current feature
    subset; returns the mean CV MSE. ``n_estimators`` sets the forest size.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    mse_scores = []
    # KFold only needs the sample count, so split over an index array
    # instead of copying the whole feature matrix back to the host.
    sample_index = np.arange(X_data.shape[0])
    for train_index, test_index in kf.split(sample_index):
        X_train = X_data[train_index, :]
        X_test = X_data[test_index, :]
        y_train = y_data[train_index]
        y_test = y_data[test_index]
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_train, y_train)
        mse_scores.append(model_mse(model, X_test, y_test))
    return np.mean(mse_scores)


# ---------------------------
# Grow the tree count dynamically until the CV MSE converges
# ---------------------------
def evaluate_n_estimators_dynamic(X_data, y_data, cv=5, start=50, step=50,
                                  threshold=1e-3, max_estimators=1000):
    """
    Starting at ``start`` and increasing by ``step``, evaluate the CV MSE
    until two consecutive values differ by less than ``threshold`` or
    ``max_estimators`` is reached. Returns {n_estimators: CV MSE}.
    """
    results = {}
    candidate = start
    prev_mse = None
    while candidate <= max_estimators:
        mse = cross_val_score_gpu(X_data, y_data, cv=cv, n_estimators=candidate)
        results[candidate] = mse
        print(f"n_estimators: {candidate}, CV MSE: {mse:.4f}")
        if prev_mse is not None and abs(prev_mse - mse) < threshold:
            break
        prev_mse = mse
        candidate += step
    return results


# Run the dynamic evaluation.
tree_results = evaluate_n_estimators_dynamic(X_gpu, y_gpu, cv=5, start=50,
                                             step=50, threshold=1e-3,
                                             max_estimators=1000)

# Plot tree count vs. CV MSE and save the figure.
plt.figure(figsize=(8, 5))
plt.plot(list(tree_results.keys()), list(tree_results.values()), marker='o')
plt.xlabel('Random Forest 树数量 (n_estimators)')
plt.ylabel('CV MSE')
plt.title('不同树数量下的交叉验证MSE')
plt.grid(True)
plt.savefig("tree_vs_cv_mse.png", dpi=300)
plt.close()

# Pick the tree count whose CV MSE came out lowest.
best_n_estimators = min(tree_results, key=tree_results.get)
print(f"最佳随机森林树数量确定为: {best_n_estimators}")


# ---------------------------
# Manual RFECV, recording the permutation importance of each removed feature.
# The best tree count found above is used for every model fit.
# ---------------------------
def manual_rfecv(X_data, y_data, cv=5, n_estimators=100):
    """Manually implemented RFECV.

    Per iteration: score the current feature subset by CV MSE, rank the
    features by permutation importance, drop the least important one and
    record its importance. The subset with the lowest CV MSE wins.

    Returns (best feature subset, best CV MSE, score history,
    permutation-importance history).
    """
    remaining = list(range(X_data.shape[1]))
    best_features = remaining.copy()
    best_score = float('inf')
    scores_history = []
    perm_imp_history = []  # importance of the feature removed at each step
    while remaining:
        score = cross_val_score_gpu(X_data[:, remaining], y_data,
                                    cv=cv, n_estimators=n_estimators)
        scores_history.append((remaining.copy(), score))
        print(f"当前特征数: {len(remaining)}, CV MSE: {score:.4f}")
        if score < best_score:
            best_score, best_features = score, remaining.copy()
        if len(remaining) == 1:
            break
        # Fit on the current subset and rank features by permutation importance.
        forest = cuRF(n_estimators=n_estimators, random_state=0)
        forest.fit(X_data[:, remaining], y_data)
        importances = permutation_importance_gpu(forest, X_data[:, remaining],
                                                 y_data, n_repeats=3,
                                                 random_state=0)
        weakest = int(np.argmin(importances))
        perm_imp_history.append(importances[weakest])
        print(f"剔除特征索引: {remaining[weakest]},置换重要性: {importances[weakest]:.4f}")
        del remaining[weakest]
    return best_features, best_score, scores_history, perm_imp_history


# Run the manual RFECV with 5-fold CV and the best tree count.
cv_folds = 5
best_features_rfecv, best_mse_rfecv, history, perm_imp_history = manual_rfecv(
    X_gpu, y_gpu, cv=cv_folds, n_estimators=best_n_estimators)

print(f"\n手动RFECV选择了 {len(best_features_rfecv)} 个特征,CV MSE: {best_mse_rfecv:.4f}")
selected_feature_names = [X_df.columns[i] for i in best_features_rfecv]
print("选定特征名称:", selected_feature_names)

# Plot the permutation-importance history and save it.
plt.figure(figsize=(8, 5))
rounds = list(range(1, len(perm_imp_history) + 1))
plt.plot(rounds, perm_imp_history, marker='o')
plt.xlabel('RFECV 迭代次数')
plt.ylabel('被剔除特征的置换重要性')
plt.title('RFECV过程中置换重要性变化')
plt.grid(True)
plt.savefig("rfecv_perm_importance.png", dpi=300)
plt.close()

# ---------------------------
# Final model: refit on the selected features with the best tree count,
# then report its cross-validated MSE.
# ---------------------------
final_model = cuRF(n_estimators=best_n_estimators, random_state=0)
final_model.fit(X_gpu[:, best_features_rfecv], y_gpu)

final_cv_mse = cross_val_score_gpu(X_gpu[:, best_features_rfecv], y_gpu,
                                   cv=cv_folds,
                                   n_estimators=best_n_estimators)
print(f"\n最终模型(最佳特征、n_estimators={best_n_estimators})CV MSE: {final_cv_mse:.4f}")