#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file: RFE_cuml_permutation_mordred.py
@Description: Manual RFECV feature-selection pipeline on GPU with cuML:
    iteratively removes unimportant features via permutation importance,
    dynamically evaluates the random-forest tree count, and finally trains
    the best model. Feature extraction computes all molecular descriptors
    with mordred.
@Date: 2025/03/04
@Author: your_name
'''
# Monkey patch: make sure numpy.product exists (it was removed in NumPy 2.0,
# but downstream libraries such as mordred may still call it).
import numpy as np

try:
    np.product
except AttributeError:
    np.product = np.prod
# Third-party imports: pandas for tabular data, matplotlib for the plots
# saved to disk further below.
import pandas as pd
import matplotlib.pyplot as plt
# ---------------------------
# Data loading and target construction (descriptors are computed later
# with mordred from the SMILES column)
# ---------------------------
data = pd.read_csv("../../data_smi.csv", index_col="Entry ID")
# Keep only the SMILES string and the assay column, renamed to TARGET.
target_data = data[["SMILES", "S.aureus ATCC25923"]].rename(
    columns={"S.aureus ATCC25923": "TARGET"}
)
# NOTE(review): freeze_support() is a no-op except in frozen Windows
# executables, and is conventionally called under an
# ``if __name__ == "__main__":`` guard — confirm whether this placement
# is intentional (mordred uses multiprocessing internally).
from multiprocessing import freeze_support

freeze_support()
from rdkit import Chem
# NOTE(review): `is_missing` is imported but not used in this file.
from mordred import Calculator, descriptors, is_missing

# Create a mordred calculator with all descriptor modules registered.
calc = Calculator(descriptors)
# Compute mordred descriptors for every molecule, one dict per input row.
# Failed SMILES parses and failed descriptor calculations both yield an
# empty dict so that row order stays aligned with target_data.
desc_list = []
for smi in target_data['SMILES']:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # SMILES failed to parse: keep an empty placeholder row.
        desc_list.append({})
        continue
    try:
        # Evaluate all registered descriptors and convert to a plain dict.
        desc_dict = calc(mol).asdict()
    except Exception:
        # Best effort: any descriptor-calculation failure yields an empty row.
        desc_dict = {}
    desc_list.append(desc_dict)
# Build the descriptor DataFrame (one row per molecule, in input order).
X_df = pd.DataFrame(desc_list)

# Cleaning: drop all-empty columns and constant (single-valued) columns.
X_df.dropna(axis=1, how='all', inplace=True)
invalid_features = [col for col in X_df.columns if X_df[col].nunique(dropna=False) <= 1]
X_df.drop(columns=invalid_features, inplace=True)

# Coerce every column to numeric; mordred error objects and other
# non-numeric values become NaN.
X_df = X_df.apply(pd.to_numeric, errors='coerce')
# Drop columns that became entirely NaN after coercion (purely non-numeric
# descriptors). Without this, fillna(mean) below leaves NaN in those
# columns (the mean of an all-NaN column is NaN) and NaNs would reach the
# float64 feature matrix.
X_df.dropna(axis=1, how='all', inplace=True)
# Mean-impute the remaining missing values column-wise.
X_df = X_df.fillna(X_df.mean())

X = X_df.values.astype(np.float64)  # dense numeric feature matrix
y = target_data['TARGET'].values

print(f"Number of training samples: {len(y)}, Number of features: {X.shape[1]}")
# ---------------------------
# Move the feature matrix and targets to the GPU (cupy arrays)
# ---------------------------
import cupy as cp

X_gpu = cp.asarray(X)
y_gpu = cp.asarray(y)
# ---------------------------
# Helper: mean squared error (MSE) of a fitted model
# ---------------------------
def model_mse(model, X_data, y_data):
    """Return the MSE of *model* on GPU arrays as a host-side scalar.

    ``.get()`` copies the cupy 0-d result back to the host.
    """
    residuals = y_data - model.predict(X_data)
    return cp.mean(residuals ** 2).get()
# ---------------------------
# GPU-based permutation-importance computation
# ---------------------------
def permutation_importance_gpu(model, X_data, y_data, n_repeats=3, random_state=0):
    """Permutation importance: increase in MSE when one column is shuffled.

    Columns are shuffled on a host (CPU) copy of the data, then copied back
    to the GPU for scoring. Returns a numpy array of per-feature importances
    (mean permuted MSE minus baseline MSE).
    """
    X_host = cp.asnumpy(X_data)  # host copy used for in-place shuffling
    baseline = model_mse(model, X_data, y_data)
    rng = np.random.RandomState(random_state)
    n_features = X_host.shape[1]
    importances = np.zeros(n_features)
    for col in range(n_features):
        repeat_scores = []
        for _ in range(n_repeats):
            shuffled = X_host.copy()
            rng.shuffle(shuffled[:, col])
            repeat_scores.append(model_mse(model, cp.asarray(shuffled), y_data))
        importances[col] = np.mean(repeat_scores) - baseline
    return importances
# ---------------------------
# Cross-validation with a configurable number of random-forest trees
# ---------------------------
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import KFold
def cross_val_score_gpu(X_data, y_data, cv=5, n_estimators=100):
    """Mean CV MSE of a cuML random forest over *cv* shuffled K-folds.

    A fresh forest (fixed random_state=0) is trained per fold on the GPU
    arrays; fold indices come from sklearn's KFold run on a host copy.
    """
    folds = KFold(n_splits=cv, shuffle=True, random_state=0)
    X_host = cp.asnumpy(X_data)  # KFold only needs the row count; split on host
    fold_mses = []
    for train_idx, test_idx in folds.split(X_host):
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_data[train_idx, :], y_data[train_idx])
        fold_mses.append(model_mse(model, X_data[test_idx, :], y_data[test_idx]))
    return np.mean(fold_mses)
# ---------------------------
# Dynamically evaluate the number of trees: grow the candidate count in
# fixed steps until the CV MSE stops changing by more than *threshold*.
# ---------------------------
def evaluate_n_estimators_dynamic(X_data, y_data, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000):
    """Scan tree counts from *start* upward and stop on CV-MSE convergence.

    Returns a dict mapping each evaluated n_estimators value to its CV MSE.
    """
    results = {}
    previous = None
    for candidate in range(start, max_estimators + 1, step):
        mse = cross_val_score_gpu(X_data, y_data, cv=cv, n_estimators=candidate)
        results[candidate] = mse
        print(f"n_estimators: {candidate}, CV MSE: {mse:.4f}")
        # Converged: successive CV MSEs differ by less than the threshold.
        if previous is not None and abs(previous - mse) < threshold:
            break
        previous = mse
    return results
# Scan candidate tree counts until the CV MSE converges.
tree_results = evaluate_n_estimators_dynamic(X_gpu, y_gpu, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000)

# Plot CV MSE as a function of the number of trees and save to disk.
plt.figure(figsize=(8, 5))
n_trees = list(tree_results.keys())
cv_mses = list(tree_results.values())
plt.plot(n_trees, cv_mses, marker='o')
plt.xlabel('Random Forest n_estimators')
plt.ylabel('CV MSE')
plt.title('CV MSE vs Number of Trees')
plt.grid(True)
plt.savefig("tree_vs_cv_mse.png", dpi=300)
plt.close()

# Pick the tree count that achieved the lowest CV MSE.
best_n_estimators = min(tree_results, key=tree_results.get)
print(f"Optimal number of trees determined: {best_n_estimators}")
# ---------------------------
# Manual RFECV (recursive feature elimination with cross-validation) driven
# by permutation importance, using the mordred descriptors as features
# ---------------------------
def manual_rfecv(X_data, y_data, feature_names, cv=5, n_estimators=100):
    """Greedy backward elimination: repeatedly drop the least important feature.

    At each step the current subset is scored by CV MSE, then a forest is
    fitted on it and the feature with the smallest permutation importance is
    removed, until a single feature remains.

    Returns (best_features, best_score, scores_history, perm_imp_history,
    removed_feature_names).
    """
    remaining = list(range(X_data.shape[1]))
    best_score = float('inf')
    best_features = remaining.copy()
    scores_history = []          # (feature subset, CV MSE) per iteration
    perm_imp_history = []        # importance of each removed feature
    removed_feature_names = []   # names of removed features, in removal order

    while remaining:
        cv_mse = cross_val_score_gpu(X_data[:, remaining], y_data, cv=cv, n_estimators=n_estimators)
        scores_history.append((remaining.copy(), cv_mse))
        print(f"Current number of features: {len(remaining)}, CV MSE: {cv_mse:.4f}")
        if cv_mse < best_score:
            best_score = cv_mse
            best_features = remaining.copy()
        if len(remaining) == 1:
            break
        # Fit on the surviving subset and drop its least important feature.
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_data[:, remaining], y_data)
        importances = permutation_importance_gpu(model, X_data[:, remaining], y_data, n_repeats=3, random_state=0)
        worst = int(np.argmin(importances))
        dropped_idx = remaining[worst]
        dropped_imp = importances[worst]
        dropped_name = feature_names[dropped_idx]
        perm_imp_history.append(dropped_imp)
        removed_feature_names.append(dropped_name)
        print(f"Removed feature index: {dropped_idx}, feature name: {dropped_name}, permutation importance: {dropped_imp:.4f}")
        del remaining[worst]

    return best_features, best_score, scores_history, perm_imp_history, removed_feature_names
# Run the manual RFECV with the optimal tree count found above.
cv_folds = 5
best_features_rfecv, best_mse_rfecv, history, perm_imp_history, removed_feature_names = manual_rfecv(
    X_gpu, y_gpu, feature_names=list(X_df.columns), cv=cv_folds, n_estimators=best_n_estimators
)

print(f"\nManual RFECV selected {len(best_features_rfecv)} features, CV MSE: {best_mse_rfecv:.4f}")
# Map the selected column indices back to descriptor names.
selected_feature_names = [X_df.columns[i] for i in best_features_rfecv]
print("Selected feature names:", selected_feature_names)
# Plot the permutation importance of the feature removed at each RFECV
# iteration and save the figure to disk.
plt.figure(figsize=(8, 5))
iterations = list(range(1, len(perm_imp_history) + 1))
plt.plot(iterations, perm_imp_history, marker='o')
plt.xlabel('RFECV Iteration')
plt.ylabel('Permutation Importance of Removed Feature')
plt.title('Permutation Importance during RFECV')
plt.grid(True)

# Annotate only points above this threshold to keep the plot readable.
annotation_threshold = 0.001
for step_idx, (x_pos, imp_val) in enumerate(zip(iterations, perm_imp_history)):
    if imp_val > annotation_threshold:
        plt.annotate(removed_feature_names[step_idx],
                     (x_pos, imp_val),
                     textcoords="offset points",
                     xytext=(0, 5),
                     ha='center')

plt.savefig("rfecv_perm_importance.png", dpi=300)
plt.close()
# Keep only the removed features whose permutation importance exceeded the
# annotation threshold, then persist the mapping for later analysis.
perm_importance_dict = {}
for name, imp in zip(removed_feature_names, perm_imp_history):
    if imp > annotation_threshold:
        perm_importance_dict[name] = imp

import pickle
with open("perm_importance_dict.pkl", "wb") as f:
    pickle.dump(perm_importance_dict, f)

print("Permutation Importance Dictionary:", perm_importance_dict)
# ---------------------------
# Final model: train on the best feature subset with the optimal tree count
# ---------------------------
X_best = X_gpu[:, best_features_rfecv]  # selected-feature view, reused below
final_model = cuRF(n_estimators=best_n_estimators, random_state=0)
final_model.fit(X_best, y_gpu)
final_cv_mse = cross_val_score_gpu(X_best, y_gpu, cv=cv_folds, n_estimators=best_n_estimators)
print(f"\nFinal model (selected features, n_estimators={best_n_estimators}) CV MSE: {final_cv_mse:.4f}")