#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file: RFE_cuml_permutation.py
@Description: Use GPU cuML to perform feature selection via a manual RFECV
    procedure, removing the least important features based on permutation
    importance, dynamically determining the optimal number of trees (until CV
    MSE converges), saving the plots, and finally training the final model
    using the best n_estimators.
@Date: 2025/03/01
@Author: lyzeng (modified)

nohup python3 -u RFE_cuml_permutation_update.py > RFE_cuml_permutation_update.log 2>&1 &
'''
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ---------------------------
# Data loading and feature construction
# ---------------------------
data = pd.read_csv("../data_smi.csv", index_col="Entry ID")
target_data = data[["SMILES", "S.aureus ATCC25923"]]
target_data.columns = ["SMILES", "TARGET"]

from rdkit import Chem
from rdkit.Chem import Descriptors


def getMolDescriptors(smiles: str, missingVal=None):
    """Compute every RDKit descriptor for a single SMILES string.

    Parameters
    ----------
    smiles : str
        Molecule in SMILES notation.
    missingVal : object, optional
        Placeholder stored for any descriptor that cannot be computed; also
        used for every descriptor when the SMILES fails to parse.

    Returns
    -------
    dict
        Mapping of descriptor name -> value (or ``missingVal``).
    """
    mol = Chem.MolFromSmiles(smiles)
    # Bug fix: MolFromSmiles returns None for an invalid SMILES. Previously
    # every descriptor was called on None, raising and being silently replaced
    # one at a time. Handle the failed parse explicitly, up front.
    if mol is None:
        return {nm: missingVal for nm, _ in Descriptors._descList}
    res = {}
    for nm, fn in Descriptors._descList:
        try:
            val = fn(mol)
        except Exception:
            # Individual descriptors can still fail on exotic molecules;
            # record the placeholder and keep going.
            val = missingVal
        res[nm] = val
    return res


# Construct feature matrix and target variable
X_df = pd.DataFrame([getMolDescriptors(smi) for smi in target_data['SMILES']])
y = target_data['TARGET'].values

# Remove invalid features (columns with only one unique value)
invalid_features = [col for col in X_df.columns if X_df[col].nunique() <= 1]
X_df.drop(columns=invalid_features, inplace=True)
X = X_df.values
print(f"Number of training samples: {len(y)}, Number of features: {X.shape[1]}")

# ---------------------------
# Convert data to GPU arrays (cupy arrays)
# ---------------------------
import cupy as cp

X_gpu = cp.asarray(X)
y_gpu = cp.asarray(y)


# ---------------------------
# Helper function: Calculate model MSE
# ---------------------------
def model_mse(model, X_data, y_data):
    """Return the MSE of ``model`` on (X_data, y_data) as a host (CPU) scalar.

    ``X_data``/``y_data`` are cupy arrays; ``.get()`` transfers the scalar
    result back from the GPU.
    """
    y_pred = model.predict(X_data)
    mse = cp.mean((y_data - y_pred) ** 2)
    return mse.get()
# ---------------------------
# Permutation importance function (GPU version)
# ---------------------------
def permutation_importance_gpu(model, X_data, y_data, n_repeats=3, random_state=0):
    """
    For the given model, compute permutation importance for each feature:

        Importance = (Mean MSE after permuting the feature) - (Original MSE)

    Each feature column is randomly shuffled and the resulting change in model
    performance is measured.

    Parameters
    ----------
    model : fitted regressor exposing ``predict``
    X_data, y_data : cupy arrays
    n_repeats : int
        Number of independent shuffles per feature (scores are averaged).
    random_state : int
        Seed of the shuffling RNG, for reproducibility.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        Permutation importance of each feature.
    """
    X_cpu = cp.asnumpy(X_data)  # Copy data to CPU once for shuffling
    baseline = model_mse(model, X_data, y_data)
    n_features = X_cpu.shape[1]
    importances = np.zeros(n_features)
    rng = np.random.RandomState(random_state)
    # Perf fix: the original copied the ENTIRE matrix for every repeat of
    # every feature. Keep a single working copy, shuffle only the current
    # column, and restore it before moving on — statistically equivalent
    # (a shuffle of a shuffled column is still a uniform permutation).
    X_work = X_cpu.copy()
    for j in range(n_features):
        original_col = X_cpu[:, j].copy()
        scores = []
        for _ in range(n_repeats):
            rng.shuffle(X_work[:, j])
            score = model_mse(model, cp.asarray(X_work), y_data)
            scores.append(score)
        X_work[:, j] = original_col  # restore the column for the next feature
        importances[j] = np.mean(scores) - baseline
    return importances


# ---------------------------
# Modified cross-validation function: allow setting number of trees
# ---------------------------
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import KFold


def cross_val_score_gpu(X_data, y_data, cv=5, n_estimators=100):
    """
    Use KFold to split the data and evaluate the CV MSE of the current feature
    subset using cuML RandomForestRegressor with a specified number of trees.

    Parameters
    ----------
    X_data, y_data : cupy arrays
    cv : int
        Number of folds.
    n_estimators : int
        Number of trees per forest.

    Returns
    -------
    float
        Mean MSE over the ``cv`` folds.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    mse_scores = []
    # Perf fix: KFold only needs the number of rows to generate fold indices,
    # so split a cheap index range instead of copying the GPU matrix to the
    # host (the original did cp.asnumpy(X_data) just for this).
    indices = np.arange(X_data.shape[0])
    for train_index, test_index in kf.split(indices):
        X_train = X_data[train_index, :]
        X_test = X_data[test_index, :]
        y_train = y_data[train_index]
        y_test = y_data[test_index]
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_train, y_train)
        mse_scores.append(model_mse(model, X_test, y_test))
    return np.mean(mse_scores)


# ---------------------------
# Dynamically evaluate the number of trees (expand candidate numbers until CV MSE converges)
# ---------------------------
def evaluate_n_estimators_dynamic(X_data, y_data, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000):
    """
    Starting from ``start``, increase the tree count by ``step`` until the
    difference between consecutive CV MSE values falls below ``threshold`` or
    ``max_estimators`` is reached.

    Returns
    -------
    dict
        {n_estimators: CV MSE} for every candidate evaluated.
    """
    results = {}
    candidate = start
    prev_mse = None
    while candidate <= max_estimators:
        mse = cross_val_score_gpu(X_data, y_data, cv=cv, n_estimators=candidate)
        results[candidate] = mse
        print(f"n_estimators: {candidate}, CV MSE: {mse:.4f}")
        # Stop once the CV MSE has effectively converged
        if prev_mse is not None and abs(prev_mse - mse) < threshold:
            break
        prev_mse = mse
        candidate += step
    return results


# Evaluate the optimal number of trees
tree_results = evaluate_n_estimators_dynamic(X_gpu, y_gpu, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000)

# Plot CV MSE vs Number of Trees and save the figure
plt.figure(figsize=(8, 5))
plt.plot(list(tree_results.keys()), list(tree_results.values()), marker='o')
plt.xlabel('Random Forest n_estimators')
plt.ylabel('CV MSE')
plt.title('CV MSE vs Number of Trees')
plt.grid(True)
plt.savefig("tree_vs_cv_mse.png", dpi=300)
plt.close()

# Determine the optimal number of trees (the one with the lowest CV MSE)
best_n_estimators = min(tree_results, key=tree_results.get)
print(f"Optimal number of trees determined: {best_n_estimators}")

# ---------------------------
# Manual RFECV implementation with permutation importance
# Now includes printing the feature name (from rdkit Descriptors) along with the index.
# Also records removed feature names for plotting annotation.
# ---------------------------
def manual_rfecv(X_data, y_data, feature_names, cv=5, n_estimators=100):
    """
    Manual RFECV:
    - Compute CV MSE on the current feature subset.
    - Compute permutation importance for each feature.
    - Remove the feature with the lowest permutation importance and record it.
    - Record the feature subset that yields the lowest CV MSE.

    Parameters
    ----------
    X_data, y_data : cupy arrays holding the full feature matrix and target.
    feature_names : list[str]
        Column names of ``X_data`` (used for logging and plot annotation).
    cv : int
        Number of cross-validation folds.
    n_estimators : int
        Number of trees for every fitted forest.

    Returns
    -------
    tuple
        (best feature indices, best CV MSE, history of (features, score)
        pairs, permutation importance of each removed feature, removed
        feature names in removal order).
    """
    n_features = X_data.shape[1]
    features = list(range(n_features))
    best_score = float('inf')
    best_features = features.copy()
    scores_history = []
    perm_imp_history = []       # permutation importance of each removed feature
    removed_feature_names = []  # corresponding feature names, in removal order
    while len(features) > 0:
        current_score = cross_val_score_gpu(X_data[:, features], y_data, cv=cv, n_estimators=n_estimators)
        scores_history.append((features.copy(), current_score))
        print(f"Current number of features: {len(features)}, CV MSE: {current_score:.4f}")
        if current_score < best_score:
            best_score = current_score
            best_features = features.copy()
        if len(features) == 1:
            break
        # Train the model and compute permutation importance
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_data[:, features], y_data)
        imp = permutation_importance_gpu(model, X_data[:, features], y_data, n_repeats=3, random_state=0)
        idx_to_remove = int(np.argmin(imp))
        removed_feature = features[idx_to_remove]
        removed_imp = imp[idx_to_remove]
        # Get the feature name using the original feature_names list
        removed_feature_name = feature_names[removed_feature]
        perm_imp_history.append(removed_imp)
        removed_feature_names.append(removed_feature_name)
        print(f"Removed feature index: {removed_feature}, feature name: {removed_feature_name}, permutation importance: {removed_imp:.4f}")
        del features[idx_to_remove]
    return best_features, best_score, scores_history, perm_imp_history, removed_feature_names


# Execute manual RFECV using 5-fold CV and the optimal number of trees, also pass the feature names
cv_folds = 5
best_features_rfecv, best_mse_rfecv, history, perm_imp_history, removed_feature_names = manual_rfecv(
    X_gpu, y_gpu, feature_names=list(X_df.columns), cv=cv_folds, n_estimators=best_n_estimators
)
print(f"\nManual RFECV selected {len(best_features_rfecv)} features, CV MSE: {best_mse_rfecv:.4f}")
selected_feature_names = [X_df.columns[i] for i in best_features_rfecv]
print("Selected feature names:", selected_feature_names)

# Plot permutation importance change during RFECV with English labels
plt.figure(figsize=(8, 5))
iterations = list(range(1, len(perm_imp_history) + 1))
plt.plot(iterations, perm_imp_history, marker='o')
plt.xlabel('RFECV Iteration')
plt.ylabel('Permutation Importance of Removed Feature')
plt.title('Permutation Importance during RFECV')
plt.grid(True)
# Annotation threshold: annotate points whose permutation importance exceeds
# 0.001. (The old comment claimed "absolute ... > 0.002"; the code has always
# tested imp > 0.001, so the comment is corrected to match the code.)
annotation_threshold = 0.001
for i, imp_val in enumerate(perm_imp_history):
    if imp_val > annotation_threshold:
        plt.annotate(removed_feature_names[i], (iterations[i], imp_val),
                     textcoords="offset points", xytext=(0, 5), ha='center')
plt.savefig("rfecv_perm_importance.png", dpi=300)
plt.close()

# ---------------------------
# Save the permutation-importance data to a dict and serialize it to a pkl file
# ---------------------------
# Build a dict keeping only the molecular properties whose permutation
# importance exceeds the annotation threshold (0.001)
perm_importance_dict = {name: imp for name, imp in zip(removed_feature_names, perm_imp_history)
                        if imp > annotation_threshold}

# Save the dict to a pickle file
import pickle
with open("perm_importance_dict.pkl", "wb") as f:
    pickle.dump(perm_importance_dict, f)

# Print the dict
print("Permutation Importance Dictionary:", perm_importance_dict)

# ---------------------------
# Final model training: train final model using best feature
# subset and optimal number of trees
# ---------------------------
# Fit one forest on the winning feature subset with the tuned tree count, and
# report its cross-validated error once more for the record.
X_sel = X_gpu[:, best_features_rfecv]
final_cv_mse = cross_val_score_gpu(X_sel, y_gpu, cv=cv_folds, n_estimators=best_n_estimators)
final_model = cuRF(n_estimators=best_n_estimators, random_state=0)
final_model.fit(X_sel, y_gpu)
print(f"\nFinal model (selected features, n_estimators={best_n_estimators}) CV MSE: {final_cv_mse:.4f}")