#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file: RFE_cuml_permutation.py
@Description: Use GPU cuML to perform feature selection via a manual RFECV procedure,
              removing the least important features based on permutation importance,
              dynamically determining the optimal number of trees (until CV MSE converges),
              saving the plots, and finally training the final model using the best n_estimators.
@Date: 2025/03/01
@Author: lyzeng (modified)

Run in the background with:
    nohup python3 -u RFE_cuml_permutation_update.py > RFE_cuml_permutation_update.log 2>&1 &
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------
# Data loading and feature construction
# ---------------------------
data = pd.read_csv("../data_smi.csv", index_col="Entry ID")

# Select the SMILES strings and the S. aureus assay column. Rename on an
# explicit copy: assigning `.columns` on a slice of `data` would operate on a
# view and trigger pandas' SettingWithCopyWarning (and may not stick).
target_data = data[["SMILES", "S.aureus ATCC25923"]].copy()
target_data.columns = ["SMILES", "TARGET"]

from rdkit import Chem
from rdkit.Chem import Descriptors
def getMolDescriptors(smiles: str, missingVal=None):
    """Compute every RDKit descriptor in ``Descriptors._descList`` for one SMILES.

    Parameters:
        smiles: SMILES string of the molecule.
        missingVal: value recorded for any descriptor that fails to compute.

    Returns:
        dict mapping descriptor name -> value (or ``missingVal`` on failure).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable SMILES: every descriptor call would raise on a None
        # molecule, so short-circuit instead of raising (and swallowing)
        # one exception per descriptor.
        return {nm: missingVal for nm, _ in Descriptors._descList}
    res = {}
    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)
        except Exception:
            # Individual descriptors can fail on exotic molecules; record
            # the sentinel value and keep going.
            val = missingVal
        res[nm] = val
    return res
# ---------------------------
# Build the descriptor matrix and the target vector
# ---------------------------
descriptor_rows = [getMolDescriptors(smi) for smi in target_data['SMILES']]
X_df = pd.DataFrame(descriptor_rows)
y = target_data['TARGET'].values

# Constant (single-valued) descriptor columns carry no information — drop them.
constant_cols = [c for c in X_df.columns if X_df[c].nunique() <= 1]
X_df.drop(columns=constant_cols, inplace=True)
X = X_df.values

print(f"Number of training samples: {len(y)}, Number of features: {X.shape[1]}")
# ---------------------------
# Convert data to GPU arrays (cupy arrays)
# ---------------------------
import cupy as cp

# Device-resident copies of the feature matrix and target vector; all cuML
# training and scoring below operates on these.
X_gpu = cp.asarray(X)
y_gpu = cp.asarray(y)
# ---------------------------
# Helper: mean squared error of a fitted model on GPU-resident data
# ---------------------------
def model_mse(model, X_data, y_data):
    """Return the MSE of ``model`` on (X_data, y_data), transferred to host."""
    residuals = y_data - model.predict(X_data)
    return cp.mean(residuals ** 2).get()
# ---------------------------
# Permutation importance function (GPU version)
# ---------------------------
def permutation_importance_gpu(model, X_data, y_data, n_repeats=3, random_state=0):
    """
    Compute permutation importance for each feature of a fitted model:

        importance = mean(MSE with feature permuted) - baseline MSE

    Each feature column is shuffled ``n_repeats`` times; a larger MSE increase
    means the model relies more on that feature.

    Parameters:
        model: fitted regressor exposing ``predict``.
        X_data, y_data: GPU (cupy) arrays used for scoring.
        n_repeats: number of independent shuffles per feature.
        random_state: seed for the NumPy RNG used for shuffling.

    Returns:
        np.ndarray of shape (n_features,) with the importances.
    """
    X_cpu = cp.asnumpy(X_data)  # host copy — shuffling happens on the CPU
    baseline = model_mse(model, X_data, y_data)
    n_features = X_cpu.shape[1]
    importances = np.zeros(n_features)
    rng = np.random.RandomState(random_state)

    # Reuse one working matrix and only touch column j each time, instead of
    # copying the full matrix for every (feature, repeat) pair. Resetting the
    # column before each shuffle reproduces the original results exactly
    # (same RNG consumption, same start state per repeat).
    X_work = X_cpu.copy()
    for j in range(n_features):
        original_col = X_cpu[:, j].copy()
        scores = []
        for _ in range(n_repeats):
            X_work[:, j] = original_col
            rng.shuffle(X_work[:, j])
            score = model_mse(model, cp.asarray(X_work), y_data)
            scores.append(score)
        X_work[:, j] = original_col  # restore before moving to the next feature
        importances[j] = np.mean(scores) - baseline
    return importances
# ---------------------------
# Modified cross-validation function: allow setting number of trees
# ---------------------------
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import KFold


def cross_val_score_gpu(X_data, y_data, cv=5, n_estimators=100):
    """
    Evaluate the CV MSE of the current feature subset using a cuML
    RandomForestRegressor with a specified number of trees.

    Parameters:
        X_data, y_data: GPU (cupy) arrays.
        cv: number of KFold splits.
        n_estimators: trees per forest.

    Returns:
        Mean MSE across the folds (float).
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    mse_scores = []
    # KFold only needs the number of samples to produce indices, so split a
    # cheap host-side index array instead of copying the whole GPU matrix
    # back to the CPU with cp.asnumpy. The fold indices are identical.
    sample_indices = np.arange(X_data.shape[0])
    for train_index, test_index in kf.split(sample_indices):
        X_train = X_data[train_index, :]
        X_test = X_data[test_index, :]
        y_train = y_data[train_index]
        y_test = y_data[test_index]
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_train, y_train)
        mse = model_mse(model, X_test, y_test)
        mse_scores.append(mse)
    return np.mean(mse_scores)
# ---------------------------
|
|
# Dynamically evaluate the number of trees (expand candidate numbers until CV MSE converges)
|
|
# ---------------------------
|
|
def evaluate_n_estimators_dynamic(X_data, y_data, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000):
|
|
"""
|
|
Starting from 'start', increase by 'step' until the difference in consecutive CV MSE is below 'threshold'
|
|
or until max_estimators is reached. Returns a dict {n_estimators: CV MSE}.
|
|
"""
|
|
results = {}
|
|
candidate = start
|
|
prev_mse = None
|
|
while candidate <= max_estimators:
|
|
mse = cross_val_score_gpu(X_data, y_data, cv=cv, n_estimators=candidate)
|
|
results[candidate] = mse
|
|
print(f"n_estimators: {candidate}, CV MSE: {mse:.4f}")
|
|
if prev_mse is not None and abs(prev_mse - mse) < threshold:
|
|
break
|
|
prev_mse = mse
|
|
candidate += step
|
|
return results
|
|
|
|
# Search for a tree count at which the CV error stops improving.
tree_results = evaluate_n_estimators_dynamic(X_gpu, y_gpu, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000)

# Plot CV MSE versus the number of trees and write the figure to disk.
candidates = list(tree_results.keys())
cv_errors = list(tree_results.values())
plt.figure(figsize=(8, 5))
plt.plot(candidates, cv_errors, marker='o')
plt.xlabel('Random Forest n_estimators')
plt.ylabel('CV MSE')
plt.title('CV MSE vs Number of Trees')
plt.grid(True)
plt.savefig("tree_vs_cv_mse.png", dpi=300)
plt.close()

# The optimal tree count is the candidate with the lowest CV MSE.
best_n_estimators = min(tree_results, key=tree_results.get)
print(f"Optimal number of trees determined: {best_n_estimators}")
# ---------------------------
# Manual RFECV driven by permutation importance.
# Prints the descriptor name (from the RDKit feature list) of each removed
# feature alongside its index, and records the removal order so the plot
# below can be annotated.
# ---------------------------
def manual_rfecv(X_data, y_data, feature_names, cv=5, n_estimators=100):
    """
    Recursive feature elimination with cross-validation:
      1. Score the current feature subset via CV MSE.
      2. Fit a forest on the subset and compute permutation importances.
      3. Drop the least important feature, recording its name and importance.
      4. Remember the subset that achieved the lowest CV MSE overall.

    Returns:
        (best feature indices, best CV MSE, score history,
         permutation-importance history, removed feature names in order).
    """
    remaining = list(range(X_data.shape[1]))
    lowest_mse = float('inf')
    best_subset = remaining.copy()
    scores_history = []
    perm_imp_history = []       # importance of each feature at removal time
    removed_feature_names = []  # feature names in removal order

    while remaining:
        subset_mse = cross_val_score_gpu(X_data[:, remaining], y_data, cv=cv, n_estimators=n_estimators)
        scores_history.append((remaining.copy(), subset_mse))
        print(f"Current number of features: {len(remaining)}, CV MSE: {subset_mse:.4f}")

        if subset_mse < lowest_mse:
            lowest_mse = subset_mse
            best_subset = remaining.copy()
        if len(remaining) == 1:
            break

        # Fit on the surviving features and rank them by permutation importance.
        forest = cuRF(n_estimators=n_estimators, random_state=0)
        forest.fit(X_data[:, remaining], y_data)
        importances = permutation_importance_gpu(forest, X_data[:, remaining], y_data, n_repeats=3, random_state=0)

        drop_pos = int(np.argmin(importances))
        dropped_index = remaining[drop_pos]
        dropped_importance = importances[drop_pos]
        # Map back to the original descriptor name for logging.
        dropped_name = feature_names[dropped_index]

        perm_imp_history.append(dropped_importance)
        removed_feature_names.append(dropped_name)
        print(f"Removed feature index: {dropped_index}, feature name: {dropped_name}, permutation importance: {dropped_importance:.4f}")
        del remaining[drop_pos]

    return best_subset, lowest_mse, scores_history, perm_imp_history, removed_feature_names
# Run the manual RFECV with 5-fold CV and the tuned tree count, passing the
# descriptor names so removals can be reported by name.
cv_folds = 5
best_features_rfecv, best_mse_rfecv, history, perm_imp_history, removed_feature_names = manual_rfecv(
    X_gpu, y_gpu, feature_names=list(X_df.columns), cv=cv_folds, n_estimators=best_n_estimators
)

print(f"\nManual RFECV selected {len(best_features_rfecv)} features, CV MSE: {best_mse_rfecv:.4f}")
selected_feature_names = list(X_df.columns[best_features_rfecv])
print("Selected feature names:", selected_feature_names)
# Plot how the permutation importance of each removed feature evolved over
# the RFECV iterations (English labels for publication figures).
plt.figure(figsize=(8, 5))
iterations = list(range(1, len(perm_imp_history) + 1))
plt.plot(iterations, perm_imp_history, marker='o')
plt.xlabel('RFECV Iteration')
plt.ylabel('Permutation Importance of Removed Feature')
plt.title('Permutation Importance during RFECV')
plt.grid(True)

# Annotate points whose permutation importance exceeds the threshold.
# NOTE(review): the comparison is signed (imp_val > 0.001), not absolute —
# features whose removal *decreased* the MSE are never annotated.
annotation_threshold = 0.001
for i, imp_val in enumerate(perm_imp_history):
    if imp_val > annotation_threshold:
        plt.annotate(removed_feature_names[i],
                     (iterations[i], imp_val),
                     textcoords="offset points",
                     xytext=(0,5),
                     ha='center')

plt.savefig("rfecv_perm_importance.png", dpi=300)
plt.close()
# ---------------------------
# Save the permutation-importance data to a dict and serialize it as a pkl file
# ---------------------------
# Keep only descriptors whose permutation importance exceeds 0.001 (the same
# signed threshold used for the plot annotations).
perm_importance_dict = {name: imp for name, imp in zip(removed_feature_names, perm_imp_history) if imp > annotation_threshold}

# Serialize the dict to a pickle file for downstream analysis.
import pickle
with open("perm_importance_dict.pkl", "wb") as f:
    pickle.dump(perm_importance_dict, f)

# Echo the dict so it also appears in the run log.
print("Permutation Importance Dictionary:", perm_importance_dict)
# ---------------------------
# Final model: refit on the selected feature subset with the tuned tree count
# ---------------------------
X_selected = X_gpu[:, best_features_rfecv]
final_model = cuRF(n_estimators=best_n_estimators, random_state=0)
final_model.fit(X_selected, y_gpu)

# Report the cross-validated error of the final configuration.
final_cv_mse = cross_val_score_gpu(X_selected, y_gpu, cv=cv_folds, n_estimators=best_n_estimators)
print(f"\nFinal model (selected features, n_estimators={best_n_estimators}) CV MSE: {final_cv_mse:.4f}")