#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file: RFE_cuml_permutation.py
@Description: Use GPU cuML to perform feature selection via a manual RFECV procedure,
              removing the least important features based on permutation importance,
              dynamically determining the optimal number of trees (until CV MSE converges),
              saving the plots, and finally training the final model using the best n_estimators.
@Date: 2025/03/01
@Author: lyzeng (modified)

Run in the background with:
    nohup python3 -u RFE_cuml_permutation_update.py > RFE_cuml_permutation_update.log 2>&1 &
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------
# Data loading and feature construction
# ---------------------------
data = pd.read_csv("../data_smi.csv", index_col="Entry ID")

# Select the SMILES strings and the S. aureus assay column. Rename on an
# explicit copy: assigning `.columns` on a slice of `data` would operate on a
# view and trigger pandas' SettingWithCopyWarning (and may not stick).
target_data = data[["SMILES", "S.aureus ATCC25923"]].copy()
target_data.columns = ["SMILES", "TARGET"]

from rdkit import Chem
from rdkit.Chem import Descriptors
def getMolDescriptors(smiles: str, missingVal=None):
    """Compute every RDKit descriptor in ``Descriptors._descList`` for one SMILES.

    Parameters:
        smiles: SMILES string of the molecule.
        missingVal: value recorded for any descriptor that fails to compute.

    Returns:
        dict mapping descriptor name -> value (or ``missingVal`` on failure).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable SMILES: every descriptor call would raise on a None
        # molecule, so short-circuit instead of raising (and swallowing)
        # one exception per descriptor.
        return {nm: missingVal for nm, _ in Descriptors._descList}
    res = {}
    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)
        except Exception:
            # Individual descriptors can fail on exotic molecules; record
            # the sentinel value and keep going.
            val = missingVal
        res[nm] = val
    return res
# ---------------------------
# Build the descriptor matrix and the target vector
# ---------------------------
descriptor_rows = [getMolDescriptors(smi) for smi in target_data['SMILES']]
X_df = pd.DataFrame(descriptor_rows)
y = target_data['TARGET'].values

# Constant (single-valued) descriptor columns carry no information — drop them.
constant_cols = [c for c in X_df.columns if X_df[c].nunique() <= 1]
X_df.drop(columns=constant_cols, inplace=True)
X = X_df.values

print(f"Number of training samples: {len(y)}, Number of features: {X.shape[1]}")
# ---------------------------
# Convert data to GPU arrays (cupy arrays)
# ---------------------------
import cupy as cp

# Device-resident copies of the feature matrix and target vector; all cuML
# training and scoring below operates on these.
X_gpu = cp.asarray(X)
y_gpu = cp.asarray(y)
# ---------------------------
# Helper: mean squared error of a fitted model on GPU-resident data
# ---------------------------
def model_mse(model, X_data, y_data):
    """Return the MSE of ``model`` on (X_data, y_data), transferred to host."""
    residuals = y_data - model.predict(X_data)
    return cp.mean(residuals ** 2).get()
# ---------------------------
# Permutation importance function (GPU version)
# ---------------------------
def permutation_importance_gpu(model, X_data, y_data, n_repeats=3, random_state=0):
    """
    Compute permutation importance for each feature of a fitted model:

        importance = mean(MSE with feature permuted) - baseline MSE

    Each feature column is shuffled ``n_repeats`` times; a larger MSE increase
    means the model relies more on that feature.

    Parameters:
        model: fitted regressor exposing ``predict``.
        X_data, y_data: GPU (cupy) arrays used for scoring.
        n_repeats: number of independent shuffles per feature.
        random_state: seed for the NumPy RNG used for shuffling.

    Returns:
        np.ndarray of shape (n_features,) with the importances.
    """
    X_cpu = cp.asnumpy(X_data)  # host copy — shuffling happens on the CPU
    baseline = model_mse(model, X_data, y_data)
    n_features = X_cpu.shape[1]
    importances = np.zeros(n_features)
    rng = np.random.RandomState(random_state)

    # Reuse one working matrix and only touch column j each time, instead of
    # copying the full matrix for every (feature, repeat) pair. Resetting the
    # column before each shuffle reproduces the original results exactly
    # (same RNG consumption, same start state per repeat).
    X_work = X_cpu.copy()
    for j in range(n_features):
        original_col = X_cpu[:, j].copy()
        scores = []
        for _ in range(n_repeats):
            X_work[:, j] = original_col
            rng.shuffle(X_work[:, j])
            score = model_mse(model, cp.asarray(X_work), y_data)
            scores.append(score)
        X_work[:, j] = original_col  # restore before moving to the next feature
        importances[j] = np.mean(scores) - baseline
    return importances
# ---------------------------
# Modified cross-validation function: allow setting number of trees
# ---------------------------
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import KFold


def cross_val_score_gpu(X_data, y_data, cv=5, n_estimators=100):
    """
    Evaluate the CV MSE of the current feature subset using a cuML
    RandomForestRegressor with a specified number of trees.

    Parameters:
        X_data, y_data: GPU (cupy) arrays.
        cv: number of KFold splits.
        n_estimators: trees per forest.

    Returns:
        Mean MSE across the folds (float).
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    mse_scores = []
    # KFold only needs the number of samples to produce indices, so split a
    # cheap host-side index array instead of copying the whole GPU matrix
    # back to the CPU with cp.asnumpy. The fold indices are identical.
    sample_indices = np.arange(X_data.shape[0])
    for train_index, test_index in kf.split(sample_indices):
        X_train = X_data[train_index, :]
        X_test = X_data[test_index, :]
        y_train = y_data[train_index]
        y_test = y_data[test_index]
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_train, y_train)
        mse = model_mse(model, X_test, y_test)
        mse_scores.append(mse)
    return np.mean(mse_scores)
# ---------------------------
|
|
# Dynamically evaluate the number of trees (expand candidate numbers until CV MSE converges)
|
|
# ---------------------------
|
|
def evaluate_n_estimators_dynamic(X_data, y_data, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000):
|
|
"""
|
|
Starting from 'start', increase by 'step' until the difference in consecutive CV MSE is below 'threshold'
|
|
or until max_estimators is reached. Returns a dict {n_estimators: CV MSE}.
|
|
"""
|
|
results = {}
|
|
candidate = start
|
|
prev_mse = None
|
|
while candidate <= max_estimators:
|
|
mse = cross_val_score_gpu(X_data, y_data, cv=cv, n_estimators=candidate)
|
|
results[candidate] = mse
|
|
print(f"n_estimators: {candidate}, CV MSE: {mse:.4f}")
|
|
if prev_mse is not None and abs(prev_mse - mse) < threshold:
|
|
break
|
|
prev_mse = mse
|
|
candidate += step
|
|
return results
|
|
|
|
# Search for a tree count at which the CV error stops improving.
tree_results = evaluate_n_estimators_dynamic(X_gpu, y_gpu, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000)

# Plot CV MSE versus the number of trees and write the figure to disk.
candidates = list(tree_results.keys())
cv_errors = list(tree_results.values())
plt.figure(figsize=(8, 5))
plt.plot(candidates, cv_errors, marker='o')
plt.xlabel('Random Forest n_estimators')
plt.ylabel('CV MSE')
plt.title('CV MSE vs Number of Trees')
plt.grid(True)
plt.savefig("tree_vs_cv_mse.png", dpi=300)
plt.close()

# The optimal tree count is the candidate with the lowest CV MSE.
best_n_estimators = min(tree_results, key=tree_results.get)
print(f"Optimal number of trees determined: {best_n_estimators}")
# ---------------------------
# Manual RFECV driven by permutation importance.
# Prints the descriptor name (from the RDKit feature list) of each removed
# feature alongside its index, and records the removal order so the plot
# below can be annotated.
# ---------------------------
def manual_rfecv(X_data, y_data, feature_names, cv=5, n_estimators=100):
    """
    Recursive feature elimination with cross-validation:
      1. Score the current feature subset via CV MSE.
      2. Fit a forest on the subset and compute permutation importances.
      3. Drop the least important feature, recording its name and importance.
      4. Remember the subset that achieved the lowest CV MSE overall.

    Returns:
        (best feature indices, best CV MSE, score history,
         permutation-importance history, removed feature names in order).
    """
    remaining = list(range(X_data.shape[1]))
    lowest_mse = float('inf')
    best_subset = remaining.copy()
    scores_history = []
    perm_imp_history = []       # importance of each feature at removal time
    removed_feature_names = []  # feature names in removal order

    while remaining:
        subset_mse = cross_val_score_gpu(X_data[:, remaining], y_data, cv=cv, n_estimators=n_estimators)
        scores_history.append((remaining.copy(), subset_mse))
        print(f"Current number of features: {len(remaining)}, CV MSE: {subset_mse:.4f}")

        if subset_mse < lowest_mse:
            lowest_mse = subset_mse
            best_subset = remaining.copy()
        if len(remaining) == 1:
            break

        # Fit on the surviving features and rank them by permutation importance.
        forest = cuRF(n_estimators=n_estimators, random_state=0)
        forest.fit(X_data[:, remaining], y_data)
        importances = permutation_importance_gpu(forest, X_data[:, remaining], y_data, n_repeats=3, random_state=0)

        drop_pos = int(np.argmin(importances))
        dropped_index = remaining[drop_pos]
        dropped_importance = importances[drop_pos]
        # Map back to the original descriptor name for logging.
        dropped_name = feature_names[dropped_index]

        perm_imp_history.append(dropped_importance)
        removed_feature_names.append(dropped_name)
        print(f"Removed feature index: {dropped_index}, feature name: {dropped_name}, permutation importance: {dropped_importance:.4f}")
        del remaining[drop_pos]

    return best_subset, lowest_mse, scores_history, perm_imp_history, removed_feature_names
# Run the manual RFECV with 5-fold CV and the tuned tree count, passing the
# descriptor names so removals can be reported by name.
cv_folds = 5
best_features_rfecv, best_mse_rfecv, history, perm_imp_history, removed_feature_names = manual_rfecv(
    X_gpu, y_gpu, feature_names=list(X_df.columns), cv=cv_folds, n_estimators=best_n_estimators
)

print(f"\nManual RFECV selected {len(best_features_rfecv)} features, CV MSE: {best_mse_rfecv:.4f}")
selected_feature_names = list(X_df.columns[best_features_rfecv])
print("Selected feature names:", selected_feature_names)
# Plot how the permutation importance of each removed feature evolved over
# the RFECV iterations (English labels for publication figures).
plt.figure(figsize=(8, 5))
iterations = list(range(1, len(perm_imp_history) + 1))
plt.plot(iterations, perm_imp_history, marker='o')
plt.xlabel('RFECV Iteration')
plt.ylabel('Permutation Importance of Removed Feature')
plt.title('Permutation Importance during RFECV')
plt.grid(True)

# Annotate points whose permutation importance exceeds the threshold.
# NOTE(review): the comparison is signed (imp_val > 0.001), not absolute —
# features whose removal *decreased* the MSE are never annotated.
annotation_threshold = 0.001
for i, imp_val in enumerate(perm_imp_history):
    if imp_val > annotation_threshold:
        plt.annotate(removed_feature_names[i],
                     (iterations[i], imp_val),
                     textcoords="offset points",
                     xytext=(0,5),
                     ha='center')

plt.savefig("rfecv_perm_importance.png", dpi=300)
plt.close()
# ---------------------------
# Save the permutation-importance data to a dict and serialize it as a pkl file
# ---------------------------
# Keep only descriptors whose permutation importance exceeds 0.001 (the same
# signed threshold used for the plot annotations).
perm_importance_dict = {name: imp for name, imp in zip(removed_feature_names, perm_imp_history) if imp > annotation_threshold}

# Serialize the dict to a pickle file for downstream analysis.
import pickle
with open("perm_importance_dict.pkl", "wb") as f:
    pickle.dump(perm_importance_dict, f)

# Echo the dict so it also appears in the run log.
print("Permutation Importance Dictionary:", perm_importance_dict)
# ---------------------------
# Final model: refit on the selected feature subset with the tuned tree count
# ---------------------------
X_selected = X_gpu[:, best_features_rfecv]
final_model = cuRF(n_estimators=best_n_estimators, random_state=0)
final_model.fit(X_selected, y_gpu)

# Report the cross-validated error of the final configuration.
final_cv_mse = cross_val_score_gpu(X_selected, y_gpu, cv=cv_folds, n_estimators=best_n_estimators)
print(f"\nFinal model (selected features, n_estimators={best_n_estimators}) CV MSE: {final_cv_mse:.4f}")