Files
qsar/1d-qsar/cuda/RFE_cuml_permutation_update.py
mm644706215 818f461beb update
2025-03-03 22:34:33 +08:00

253 lines
10 KiB
Python

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file: RFE_cuml_permutation_update.py
@Description: Use GPU cuML to perform feature selection via a manual RFECV procedure,
removing the least important features based on permutation importance,
dynamically determining the optimal number of trees (until CV MSE converges),
saving the plots, and finally training the final model using the best n_estimators.
@Date: 2025/03/01
@Author: lyzeng (modified)
nohup python3 -u RFE_cuml_permutation_update.py > RFE_cuml_permutation_update.log 2>&1 &
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ---------------------------
# Data loading and feature construction
# ---------------------------
data = pd.read_csv("../data_smi.csv", index_col="Entry ID")
# Select the SMILES column and the S. aureus activity as the regression target.
# .copy() makes target_data an independent frame, so the column rename below
# does not act on a view of `data` (pandas SettingWithCopyWarning hazard).
target_data = data[["SMILES", "S.aureus ATCC25923"]].copy()
target_data.columns = ["SMILES", "TARGET"]
from rdkit import Chem
from rdkit.Chem import Descriptors
def getMolDescriptors(smiles: str, missingVal=None) -> dict:
    """Compute every RDKit molecular descriptor for one SMILES string.

    Parameters
    ----------
    smiles : str
        Molecule in SMILES notation.
    missingVal : optional
        Value recorded for any descriptor that cannot be computed
        (including all descriptors of an unparsable SMILES).

    Returns
    -------
    dict
        Mapping of descriptor name -> value (or ``missingVal``).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Invalid SMILES: the original code raised (and swallowed) one
        # exception per descriptor; short-circuit with the same result.
        return {nm: missingVal for nm, _ in Descriptors._descList}
    res = {}
    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)
        except Exception:
            # Individual descriptors can fail on exotic molecules; record
            # the sentinel instead of aborting the whole row.
            val = missingVal
        res[nm] = val
    return res
# ---------------------------
# Build the descriptor matrix and the regression target
# ---------------------------
descriptor_rows = [getMolDescriptors(smi) for smi in target_data['SMILES']]
X_df = pd.DataFrame(descriptor_rows)
y = target_data['TARGET'].values
# Drop constant columns: a descriptor with a single unique value carries no signal.
constant_cols = [col for col in X_df.columns if X_df[col].nunique() <= 1]
X_df.drop(columns=constant_cols, inplace=True)
X = X_df.values
print(f"Number of training samples: {len(y)}, Number of features: {X.shape[1]}")
# ---------------------------
# Move the data onto the GPU as cupy arrays
# ---------------------------
import cupy as cp

X_gpu = cp.asarray(X)
y_gpu = cp.asarray(y)
# ---------------------------
# Helper function: Calculate model MSE
# ---------------------------
def model_mse(model, X_data, y_data):
    """Mean-squared error of ``model`` on (X_data, y_data), returned as a host scalar."""
    residuals = y_data - model.predict(X_data)
    return cp.mean(residuals ** 2).get()
# ---------------------------
# Permutation importance function (GPU version)
# ---------------------------
def permutation_importance_gpu(model, X_data, y_data, n_repeats=3, random_state=0):
    """
    Permutation importance of every feature for an already-fitted model.

    Importance(j) = mean MSE with column j shuffled - baseline MSE,
    averaged over ``n_repeats`` independent shuffles.  A larger value
    means the model relied more on that column.
    """
    X_host = cp.asnumpy(X_data)  # shuffling happens on a CPU copy
    baseline_mse = model_mse(model, X_data, y_data)
    rng = np.random.RandomState(random_state)
    n_cols = X_host.shape[1]
    importances = np.zeros(n_cols)
    for col in range(n_cols):
        permuted_scores = []
        for _ in range(n_repeats):
            shuffled = X_host.copy()
            rng.shuffle(shuffled[:, col])
            permuted_scores.append(model_mse(model, cp.asarray(shuffled), y_data))
        importances[col] = np.mean(permuted_scores) - baseline_mse
    return importances
# ---------------------------
# Modified cross-validation function: allow setting number of trees
# ---------------------------
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import KFold


def cross_val_score_gpu(X_data, y_data, cv=5, n_estimators=100):
    """
    K-fold cross-validated MSE of a cuML random forest.

    Parameters
    ----------
    X_data, y_data : cupy arrays; they stay on the GPU and folds are
        sliced with NumPy index arrays (supported by cupy fancy indexing).
    cv : int
        Number of folds.
    n_estimators : int
        Number of trees per fold model.

    Returns
    -------
    float
        Mean MSE over the ``cv`` test folds.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    # KFold only needs the sample count to generate index arrays -- use a
    # cheap index range instead of copying the whole feature matrix to the
    # host (the original cp.asnumpy(X_data) transferred every element).
    sample_indices = np.arange(X_data.shape[0])
    mse_scores = []
    for train_index, test_index in kf.split(sample_indices):
        X_train, X_test = X_data[train_index, :], X_data[test_index, :]
        y_train, y_test = y_data[train_index], y_data[test_index]
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_train, y_train)
        mse_scores.append(model_mse(model, X_test, y_test))
    return np.mean(mse_scores)
# ---------------------------
# Dynamically evaluate the number of trees (expand candidate numbers until CV MSE converges)
# ---------------------------
def evaluate_n_estimators_dynamic(X_data, y_data, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000):
    """
    Grow the candidate tree count from ``start`` in increments of ``step``
    until two consecutive CV MSE values differ by less than ``threshold``,
    or until ``max_estimators`` is exceeded.

    Returns a dict mapping each evaluated n_estimators to its CV MSE.
    """
    results = {}
    prev_mse = None
    for candidate in range(start, max_estimators + 1, step):
        mse = cross_val_score_gpu(X_data, y_data, cv=cv, n_estimators=candidate)
        results[candidate] = mse
        print(f"n_estimators: {candidate}, CV MSE: {mse:.4f}")
        # Stop once the curve has flattened out (convergence).
        if prev_mse is not None and abs(prev_mse - mse) < threshold:
            break
        prev_mse = mse
    return results
# Evaluate the optimal number of trees on the full (pre-RFECV) feature set
tree_results = evaluate_n_estimators_dynamic(X_gpu, y_gpu, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000)
# Plot CV MSE vs Number of Trees and save the figure
plt.figure(figsize=(8, 5))
plt.plot(list(tree_results.keys()), list(tree_results.values()), marker='o')
plt.xlabel('Random Forest n_estimators')
plt.ylabel('CV MSE')
plt.title('CV MSE vs Number of Trees')
plt.grid(True)
plt.savefig("tree_vs_cv_mse.png", dpi=300)
plt.close()
# Determine the optimal number of trees (the evaluated candidate with the lowest CV MSE)
best_n_estimators = min(tree_results, key=tree_results.get)
print(f"Optimal number of trees determined: {best_n_estimators}")
# ---------------------------
# Manual RFECV implementation with permutation importance
# Now includes printing the feature name (from rdkit Descriptors) along with the index.
# Also records removed feature names for plotting annotation.
# ---------------------------
def manual_rfecv(X_data, y_data, feature_names, cv=5, n_estimators=100):
    """
    Recursive feature elimination with cross-validation, driven by
    permutation importance.

    Each iteration:
      - scores the current column subset with K-fold CV,
      - refits a forest on the full data and computes permutation
        importance per column,
      - drops the column with the smallest importance and records it.
    The subset with the lowest CV MSE seen along the way wins.

    Returns
    -------
    (best feature indices, best CV MSE, [(subset, score), ...],
     importance of each removed feature, removed feature names in order)
    """
    remaining = list(range(X_data.shape[1]))
    best_score = float('inf')
    best_features = remaining.copy()
    scores_history = []
    perm_imp_history = []       # importance of the feature dropped at each step
    removed_feature_names = []  # matching descriptor names
    while remaining:
        current_score = cross_val_score_gpu(X_data[:, remaining], y_data, cv=cv, n_estimators=n_estimators)
        scores_history.append((remaining.copy(), current_score))
        print(f"Current number of features: {len(remaining)}, CV MSE: {current_score:.4f}")
        if current_score < best_score:
            best_score, best_features = current_score, remaining.copy()
        if len(remaining) == 1:
            break
        # Refit on the whole data set and rank columns by permutation importance.
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_data[:, remaining], y_data)
        imp = permutation_importance_gpu(model, X_data[:, remaining], y_data, n_repeats=3, random_state=0)
        drop_pos = int(np.argmin(imp))
        dropped_idx = remaining[drop_pos]        # index into the ORIGINAL column order
        dropped_imp = imp[drop_pos]
        dropped_name = feature_names[dropped_idx]
        perm_imp_history.append(dropped_imp)
        removed_feature_names.append(dropped_name)
        print(f"Removed feature index: {dropped_idx}, feature name: {dropped_name}, permutation importance: {dropped_imp:.4f}")
        del remaining[drop_pos]
    return best_features, best_score, scores_history, perm_imp_history, removed_feature_names
# Execute manual RFECV using 5-fold CV and the optimal number of trees, also pass the feature names
cv_folds = 5
best_features_rfecv, best_mse_rfecv, history, perm_imp_history, removed_feature_names = manual_rfecv(
    X_gpu, y_gpu, feature_names=list(X_df.columns), cv=cv_folds, n_estimators=best_n_estimators
)
print(f"\nManual RFECV selected {len(best_features_rfecv)} features, CV MSE: {best_mse_rfecv:.4f}")
# RFECV indices refer to the columns of X_df (after constant-column removal)
selected_feature_names = [X_df.columns[i] for i in best_features_rfecv]
print("Selected feature names:", selected_feature_names)
# Plot permutation importance change during RFECV with English labels
plt.figure(figsize=(8, 5))
iterations = list(range(1, len(perm_imp_history) + 1))
plt.plot(iterations, perm_imp_history, marker='o')
plt.xlabel('RFECV Iteration')
plt.ylabel('Permutation Importance of Removed Feature')
plt.title('Permutation Importance during RFECV')
plt.grid(True)
# Annotation threshold: annotate points where permutation importance > 0.001
annotation_threshold = 0.001
for i, imp_val in enumerate(perm_imp_history):
    if imp_val > annotation_threshold:
        plt.annotate(removed_feature_names[i],
                     (iterations[i], imp_val),
                     textcoords="offset points",
                     xytext=(0,5),
                     ha='center')
plt.savefig("rfecv_perm_importance.png", dpi=300)
plt.close()
# ---------------------------
# Save the permutation-importance data to a dict and serialize it to a pkl file
# ---------------------------
# Build a dict keeping only the molecular descriptors whose permutation
# importance exceeds the annotation threshold (0.001)
perm_importance_dict = {name: imp for name, imp in zip(removed_feature_names, perm_imp_history) if imp > annotation_threshold}
# Dump the dict to a pickle file
import pickle
with open("perm_importance_dict.pkl", "wb") as f:
    pickle.dump(perm_importance_dict, f)
# Print the dict
print("Permutation Importance Dictionary:", perm_importance_dict)
# ---------------------------
# Final model training: train final model using best feature subset and optimal number of trees
# ---------------------------
final_model = cuRF(n_estimators=best_n_estimators, random_state=0)
final_model.fit(X_gpu[:, best_features_rfecv], y_gpu)
# NOTE(review): this CV score refits fresh per-fold models; it does not
# evaluate `final_model` itself, which is trained on all samples above.
final_cv_mse = cross_val_score_gpu(X_gpu[:, best_features_rfecv], y_gpu, cv=cv_folds, n_estimators=best_n_estimators)
print(f"\nFinal model (selected features, n_estimators={best_n_estimators}) CV MSE: {final_cv_mse:.4f}")