#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file: RFE_cuml_permutation.py
@Description: 使用GPU cuML实现特征选择,手动实现RFECV,并通过置换重要性计算来剔除最不重要的特征,
              同时通过交叉验证确定合理的随机森林树数量(动态扩展直到CV MSE收敛),
              并将结果保存为图像,最后使用最佳树数进行训练。
@Date: 2025/03/01
@Author: lyzeng(修改版)
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------
# Data loading and feature construction
# ---------------------------
data = pd.read_csv("../data_smi.csv", index_col="Entry ID")

# Keep only the SMILES column and the assay we model against,
# relabelling the assay column with the generic name "TARGET".
target_data = data[["SMILES", "S.aureus ATCC25923"]].rename(
    columns={"S.aureus ATCC25923": "TARGET"}
)

from rdkit import Chem
from rdkit.Chem import Descriptors
def getMolDescriptors(smiles: str, missingVal=None):
    """Compute every RDKit molecular descriptor for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    missingVal : optional
        Value recorded for any descriptor that cannot be computed
        (including all descriptors when the SMILES fails to parse).

    Returns
    -------
    dict
        Mapping of descriptor name -> value (or ``missingVal``).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Invalid SMILES: previously every descriptor call raised on the
        # None molecule one at a time. Short-circuit to the same all-missing
        # result without exercising ~200 exception paths.
        return {nm: missingVal for nm, _ in Descriptors._descList}
    res = {}
    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)
        except Exception:
            # Some descriptors raise on unusual molecules; store the
            # sentinel instead of aborting the whole feature row.
            val = missingVal
        res[nm] = val
    return res
# Build the descriptor matrix and the regression target.
X_df = pd.DataFrame([getMolDescriptors(smi) for smi in target_data['SMILES']])
y = target_data['TARGET'].values

# Drop uninformative columns: a single unique value carries no signal.
invalid_features = [col for col in X_df.columns if X_df[col].nunique() <= 1]
X_df = X_df.drop(columns=invalid_features)
X = X_df.values

print(f"训练样本数: {len(y)}, 特征数量: {X.shape[1]}")

# ---------------------------
# Move the data onto the GPU as cupy arrays.
# ---------------------------
import cupy as cp

X_gpu = cp.asarray(X)
y_gpu = cp.asarray(y)
# ---------------------------
# Helper: mean squared error of a fitted model on GPU arrays.
# ---------------------------
def model_mse(model, X_data, y_data):
    """Return the MSE of ``model`` on (X_data, y_data) as a host-side value."""
    residual = y_data - model.predict(X_data)
    # .get() copies the scalar result from device to host memory.
    return cp.mean(residual * residual).get()
# ---------------------------
# Permutation importance computed against the GPU model.
# ---------------------------
def permutation_importance_gpu(model, X_data, y_data, n_repeats=3, random_state=0):
    """Permutation importance for every feature column.

    importance[j] = (mean MSE with column j shuffled) - (baseline MSE),
    i.e. how much the model degrades once feature j is destroyed.
    Shuffling is done on a CPU copy, then pushed back to the GPU for scoring.
    """
    X_cpu = cp.asnumpy(X_data)  # host copy used for the shuffles
    baseline = model_mse(model, X_data, y_data)
    rng = np.random.RandomState(random_state)
    n_cols = X_cpu.shape[1]
    importances = np.zeros(n_cols)
    for j in range(n_cols):
        degraded = []
        for _ in range(n_repeats):
            shuffled = X_cpu.copy()
            rng.shuffle(shuffled[:, j])
            degraded.append(model_mse(model, cp.asarray(shuffled), y_data))
        importances[j] = np.mean(degraded) - baseline
    return importances
# ---------------------------
# Cross-validation scorer with a configurable number of trees.
# ---------------------------
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import KFold


def cross_val_score_gpu(X_data, y_data, cv=5, n_estimators=100):
    """Mean CV MSE of a cuML random forest on the given feature set.

    KFold (shuffled, seed 0) produces the split indices on a CPU copy of
    the data; each fold trains a fresh forest with ``n_estimators`` trees.
    """
    splitter = KFold(n_splits=cv, shuffle=True, random_state=0)
    X_np = cp.asnumpy(X_data)  # KFold generates host-side index arrays
    fold_mse = []
    for train_idx, test_idx in splitter.split(X_np):
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_data[train_idx, :], y_data[train_idx])
        fold_mse.append(model_mse(model, X_data[test_idx, :], y_data[test_idx]))
    return np.mean(fold_mse)
# ---------------------------
# Dynamically grow the tree count until the CV MSE converges.
# ---------------------------
def evaluate_n_estimators_dynamic(X_data, y_data, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000):
    """Evaluate increasing tree counts until the CV MSE stabilises.

    Starting at ``start`` and stepping by ``step``, stop as soon as two
    consecutive CV MSE values differ by less than ``threshold``, or once
    the candidate exceeds ``max_estimators``.

    Returns
    -------
    dict
        {n_estimators: CV MSE} for every candidate that was evaluated.
    """
    results = {}
    prev_mse = None
    candidate = start
    while candidate <= max_estimators:
        mse = cross_val_score_gpu(X_data, y_data, cv=cv, n_estimators=candidate)
        results[candidate] = mse
        print(f"n_estimators: {candidate}, CV MSE: {mse:.4f}")
        converged = prev_mse is not None and abs(prev_mse - mse) < threshold
        if converged:
            break
        prev_mse = mse
        candidate += step
    return results
# Run the dynamic tree-count search.
tree_results = evaluate_n_estimators_dynamic(X_gpu, y_gpu, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000)

# Plot tree count vs. CV MSE and save the figure to disk.
plt.figure(figsize=(8, 5))
plt.plot(list(tree_results.keys()), list(tree_results.values()), marker='o')
plt.xlabel('Random Forest 树数量 (n_estimators)')
plt.ylabel('CV MSE')
plt.title('不同树数量下的交叉验证MSE')
plt.grid(True)
plt.savefig("tree_vs_cv_mse.png", dpi=300)
plt.close()

# The best tree count is the candidate with the lowest CV MSE.
best_n_estimators = min(tree_results, key=tree_results.get)
print(f"最佳随机森林树数量确定为: {best_n_estimators}")
# ---------------------------
# Hand-rolled RFECV driven by permutation importance; the tuned tree
# count is passed in and used for every model fit.
# ---------------------------
def manual_rfecv(X_data, y_data, cv=5, n_estimators=100):
    """Recursive feature elimination with cross-validation.

    Each round: score the current feature subset by CV MSE, compute the
    permutation importance of every remaining feature, drop the least
    important one (recording its importance), and remember the subset
    with the lowest CV MSE seen so far.

    Returns
    -------
    tuple
        (best_features, best_score, scores_history, perm_imp_history)
    """
    features = list(range(X_data.shape[1]))
    best_features = features.copy()
    best_score = float('inf')
    scores_history = []
    perm_imp_history = []  # permutation importance of each removed feature

    while features:
        subset = X_data[:, features]
        current_score = cross_val_score_gpu(subset, y_data, cv=cv, n_estimators=n_estimators)
        scores_history.append((features.copy(), current_score))
        print(f"当前特征数: {len(features)}, CV MSE: {current_score:.4f}")

        if current_score < best_score:
            best_score = current_score
            best_features = features.copy()

        if len(features) == 1:
            break

        # Fit on the current subset and rank features by permutation importance.
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(subset, y_data)
        imp = permutation_importance_gpu(model, subset, y_data, n_repeats=3, random_state=0)

        drop_pos = int(np.argmin(imp))
        perm_imp_history.append(imp[drop_pos])
        print(f"剔除特征索引: {features[drop_pos]},置换重要性: {imp[drop_pos]:.4f}")
        features.pop(drop_pos)

    return best_features, best_score, scores_history, perm_imp_history
# Run the manual RFECV with 5-fold CV and the tuned tree count.
cv_folds = 5
best_features_rfecv, best_mse_rfecv, history, perm_imp_history = manual_rfecv(X_gpu, y_gpu, cv=cv_folds, n_estimators=best_n_estimators)

print(f"\n手动RFECV选择了 {len(best_features_rfecv)} 个特征,CV MSE: {best_mse_rfecv:.4f}")
selected_feature_names = [X_df.columns[i] for i in best_features_rfecv]
print("选定特征名称:", selected_feature_names)

# Plot how the permutation importance of the removed feature evolved,
# then save the figure to disk.
plt.figure(figsize=(8, 5))
iterations = list(range(1, len(perm_imp_history) + 1))
plt.plot(iterations, perm_imp_history, marker='o')
plt.xlabel('RFECV 迭代次数')
plt.ylabel('被剔除特征的置换重要性')
plt.title('RFECV过程中置换重要性变化')
plt.grid(True)
plt.savefig("rfecv_perm_importance.png", dpi=300)
plt.close()

# ---------------------------
# Final model: best feature subset + tuned tree count.
# ---------------------------
final_model = cuRF(n_estimators=best_n_estimators, random_state=0)
final_model.fit(X_gpu[:, best_features_rfecv], y_gpu)
final_cv_mse = cross_val_score_gpu(X_gpu[:, best_features_rfecv], y_gpu, cv=cv_folds, n_estimators=best_n_estimators)
print(f"\n最终模型(最佳特征、n_estimators={best_n_estimators})CV MSE: {final_cv_mse:.4f}")