#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file: RFE_cuml_permutation_mordred.py
@Description: Manual RFECV feature-selection pipeline on GPU with cuML:
    iteratively removes unimportant features via permutation importance,
    dynamically evaluates the random-forest tree count, and finally trains
    the best model. Feature extraction computes all molecular descriptors
    with mordred.
@Date: 2025/03/04
@Author: your_name
'''
# Monkey patch: make sure numpy.product exists (it was removed in NumPy 2.0,
# but downstream libraries such as mordred may still call it).
import numpy as np

try:
    np.product
except AttributeError:
    np.product = np.prod
# Third-party imports: pandas for tabular data, matplotlib for the plots
# saved to disk further below.
import pandas as pd
import matplotlib.pyplot as plt
# ---------------------------
# Data loading and target construction (descriptors are computed later
# with mordred from the SMILES column)
# ---------------------------
data = pd.read_csv("../../data_smi.csv", index_col="Entry ID")
# Keep only the SMILES string and the assay column, renamed to TARGET.
target_data = data[["SMILES", "S.aureus ATCC25923"]].rename(
    columns={"S.aureus ATCC25923": "TARGET"}
)
# NOTE(review): freeze_support() is a no-op except in frozen Windows
# executables, and is conventionally called under an
# ``if __name__ == "__main__":`` guard — confirm whether this placement
# is intentional (mordred uses multiprocessing internally).
from multiprocessing import freeze_support

freeze_support()
from rdkit import Chem
# NOTE(review): `is_missing` is imported but not used in this file.
from mordred import Calculator, descriptors, is_missing

# Create a mordred calculator with all descriptor modules registered.
calc = Calculator(descriptors)
# Compute mordred descriptors for every molecule, one dict per input row.
# Failed SMILES parses and failed descriptor calculations both yield an
# empty dict so that row order stays aligned with target_data.
desc_list = []
for smi in target_data['SMILES']:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # SMILES failed to parse: keep an empty placeholder row.
        desc_list.append({})
        continue
    try:
        # Evaluate all registered descriptors and convert to a plain dict.
        desc_dict = calc(mol).asdict()
    except Exception:
        # Best effort: any descriptor-calculation failure yields an empty row.
        desc_dict = {}
    desc_list.append(desc_dict)
# Build the descriptor DataFrame (one row per molecule, in input order).
X_df = pd.DataFrame(desc_list)

# Cleaning: drop all-empty columns and constant (single-valued) columns.
X_df.dropna(axis=1, how='all', inplace=True)
invalid_features = [col for col in X_df.columns if X_df[col].nunique(dropna=False) <= 1]
X_df.drop(columns=invalid_features, inplace=True)

# Coerce every column to numeric; mordred error objects and other
# non-numeric values become NaN.
X_df = X_df.apply(pd.to_numeric, errors='coerce')
# Drop columns that became entirely NaN after coercion (purely non-numeric
# descriptors). Without this, fillna(mean) below leaves NaN in those
# columns (the mean of an all-NaN column is NaN) and NaNs would reach the
# float64 feature matrix.
X_df.dropna(axis=1, how='all', inplace=True)
# Mean-impute the remaining missing values column-wise.
X_df = X_df.fillna(X_df.mean())

X = X_df.values.astype(np.float64)  # dense numeric feature matrix
y = target_data['TARGET'].values

print(f"Number of training samples: {len(y)}, Number of features: {X.shape[1]}")
# ---------------------------
# Move the feature matrix and targets to the GPU (cupy arrays)
# ---------------------------
import cupy as cp

X_gpu = cp.asarray(X)
y_gpu = cp.asarray(y)
# ---------------------------
# Helper: mean squared error (MSE) of a fitted model
# ---------------------------
def model_mse(model, X_data, y_data):
    """Return the MSE of *model* on GPU arrays as a host-side scalar.

    ``.get()`` copies the cupy 0-d result back to the host.
    """
    residuals = y_data - model.predict(X_data)
    return cp.mean(residuals ** 2).get()
# ---------------------------
# GPU-based permutation-importance computation
# ---------------------------
def permutation_importance_gpu(model, X_data, y_data, n_repeats=3, random_state=0):
    """Permutation importance: increase in MSE when one column is shuffled.

    Columns are shuffled on a host (CPU) copy of the data, then copied back
    to the GPU for scoring. Returns a numpy array of per-feature importances
    (mean permuted MSE minus baseline MSE).
    """
    X_host = cp.asnumpy(X_data)  # host copy used for in-place shuffling
    baseline = model_mse(model, X_data, y_data)
    rng = np.random.RandomState(random_state)
    n_features = X_host.shape[1]
    importances = np.zeros(n_features)
    for col in range(n_features):
        repeat_scores = []
        for _ in range(n_repeats):
            shuffled = X_host.copy()
            rng.shuffle(shuffled[:, col])
            repeat_scores.append(model_mse(model, cp.asarray(shuffled), y_data))
        importances[col] = np.mean(repeat_scores) - baseline
    return importances
# ---------------------------
# Cross-validation with a configurable number of random-forest trees
# ---------------------------
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import KFold
def cross_val_score_gpu(X_data, y_data, cv=5, n_estimators=100):
    """Mean CV MSE of a cuML random forest over *cv* shuffled K-folds.

    A fresh forest (fixed random_state=0) is trained per fold on the GPU
    arrays; fold indices come from sklearn's KFold run on a host copy.
    """
    folds = KFold(n_splits=cv, shuffle=True, random_state=0)
    X_host = cp.asnumpy(X_data)  # KFold only needs the row count; split on host
    fold_mses = []
    for train_idx, test_idx in folds.split(X_host):
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_data[train_idx, :], y_data[train_idx])
        fold_mses.append(model_mse(model, X_data[test_idx, :], y_data[test_idx]))
    return np.mean(fold_mses)
# ---------------------------
# Dynamically evaluate the number of trees: grow the candidate count in
# fixed steps until the CV MSE stops changing by more than *threshold*.
# ---------------------------
def evaluate_n_estimators_dynamic(X_data, y_data, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000):
    """Scan tree counts from *start* upward and stop on CV-MSE convergence.

    Returns a dict mapping each evaluated n_estimators value to its CV MSE.
    """
    results = {}
    previous = None
    for candidate in range(start, max_estimators + 1, step):
        mse = cross_val_score_gpu(X_data, y_data, cv=cv, n_estimators=candidate)
        results[candidate] = mse
        print(f"n_estimators: {candidate}, CV MSE: {mse:.4f}")
        # Converged: successive CV MSEs differ by less than the threshold.
        if previous is not None and abs(previous - mse) < threshold:
            break
        previous = mse
    return results
# Scan candidate tree counts until the CV MSE converges.
tree_results = evaluate_n_estimators_dynamic(X_gpu, y_gpu, cv=5, start=50, step=50, threshold=1e-3, max_estimators=1000)

# Plot CV MSE as a function of the number of trees and save to disk.
plt.figure(figsize=(8, 5))
n_trees = list(tree_results.keys())
cv_mses = list(tree_results.values())
plt.plot(n_trees, cv_mses, marker='o')
plt.xlabel('Random Forest n_estimators')
plt.ylabel('CV MSE')
plt.title('CV MSE vs Number of Trees')
plt.grid(True)
plt.savefig("tree_vs_cv_mse.png", dpi=300)
plt.close()

# Pick the tree count that achieved the lowest CV MSE.
best_n_estimators = min(tree_results, key=tree_results.get)
print(f"Optimal number of trees determined: {best_n_estimators}")
# ---------------------------
# Manual RFECV (recursive feature elimination with cross-validation) driven
# by permutation importance, using the mordred descriptors as features
# ---------------------------
def manual_rfecv(X_data, y_data, feature_names, cv=5, n_estimators=100):
    """Greedy backward elimination: repeatedly drop the least important feature.

    At each step the current subset is scored by CV MSE, then a forest is
    fitted on it and the feature with the smallest permutation importance is
    removed, until a single feature remains.

    Returns (best_features, best_score, scores_history, perm_imp_history,
    removed_feature_names).
    """
    remaining = list(range(X_data.shape[1]))
    best_score = float('inf')
    best_features = remaining.copy()
    scores_history = []          # (feature subset, CV MSE) per iteration
    perm_imp_history = []        # importance of each removed feature
    removed_feature_names = []   # names of removed features, in removal order

    while remaining:
        cv_mse = cross_val_score_gpu(X_data[:, remaining], y_data, cv=cv, n_estimators=n_estimators)
        scores_history.append((remaining.copy(), cv_mse))
        print(f"Current number of features: {len(remaining)}, CV MSE: {cv_mse:.4f}")
        if cv_mse < best_score:
            best_score = cv_mse
            best_features = remaining.copy()
        if len(remaining) == 1:
            break
        # Fit on the surviving subset and drop its least important feature.
        model = cuRF(n_estimators=n_estimators, random_state=0)
        model.fit(X_data[:, remaining], y_data)
        importances = permutation_importance_gpu(model, X_data[:, remaining], y_data, n_repeats=3, random_state=0)
        worst = int(np.argmin(importances))
        dropped_idx = remaining[worst]
        dropped_imp = importances[worst]
        dropped_name = feature_names[dropped_idx]
        perm_imp_history.append(dropped_imp)
        removed_feature_names.append(dropped_name)
        print(f"Removed feature index: {dropped_idx}, feature name: {dropped_name}, permutation importance: {dropped_imp:.4f}")
        del remaining[worst]

    return best_features, best_score, scores_history, perm_imp_history, removed_feature_names
# Run the manual RFECV with the optimal tree count found above.
cv_folds = 5
best_features_rfecv, best_mse_rfecv, history, perm_imp_history, removed_feature_names = manual_rfecv(
    X_gpu, y_gpu, feature_names=list(X_df.columns), cv=cv_folds, n_estimators=best_n_estimators
)

print(f"\nManual RFECV selected {len(best_features_rfecv)} features, CV MSE: {best_mse_rfecv:.4f}")
# Map the selected column indices back to descriptor names.
selected_feature_names = [X_df.columns[i] for i in best_features_rfecv]
print("Selected feature names:", selected_feature_names)
# Plot the permutation importance of the feature removed at each RFECV
# iteration and save the figure to disk.
plt.figure(figsize=(8, 5))
iterations = list(range(1, len(perm_imp_history) + 1))
plt.plot(iterations, perm_imp_history, marker='o')
plt.xlabel('RFECV Iteration')
plt.ylabel('Permutation Importance of Removed Feature')
plt.title('Permutation Importance during RFECV')
plt.grid(True)

# Annotate only points above this threshold to keep the plot readable.
annotation_threshold = 0.001
for step_idx, (x_pos, imp_val) in enumerate(zip(iterations, perm_imp_history)):
    if imp_val > annotation_threshold:
        plt.annotate(removed_feature_names[step_idx],
                     (x_pos, imp_val),
                     textcoords="offset points",
                     xytext=(0, 5),
                     ha='center')

plt.savefig("rfecv_perm_importance.png", dpi=300)
plt.close()
# Keep only the removed features whose permutation importance exceeded the
# annotation threshold, then persist the mapping for later analysis.
perm_importance_dict = {}
for name, imp in zip(removed_feature_names, perm_imp_history):
    if imp > annotation_threshold:
        perm_importance_dict[name] = imp

import pickle
with open("perm_importance_dict.pkl", "wb") as f:
    pickle.dump(perm_importance_dict, f)

print("Permutation Importance Dictionary:", perm_importance_dict)
# ---------------------------
# Final model: train on the best feature subset with the optimal tree count
# ---------------------------
X_best = X_gpu[:, best_features_rfecv]  # selected-feature view, reused below
final_model = cuRF(n_estimators=best_n_estimators, random_state=0)
final_model.fit(X_best, y_gpu)
final_cv_mse = cross_val_score_gpu(X_best, y_gpu, cv=cv_folds, n_estimators=best_n_estimators)
print(f"\nFinal model (selected features, n_estimators={best_n_estimators}) CV MSE: {final_cv_mse:.4f}")