Files
qsar/1d-qsar/cuda/qssar1d.py
mm644706215 4cb2d9f56c add 1dqsar
2025-03-03 20:23:09 +08:00

153 lines
6.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file: RFE_cuml_permutation.py
@Description: 使用GPU cuML实现特征选择手动实现RFECV并通过置换重要性计算来剔除最不重要的特征
@Date: 2025/03/01
@Author: lyzeng
'''
import numpy as np
import pandas as pd
# ---------------------------
# 数据加载与特征构造
# ---------------------------
data = pd.read_csv("../data_smi.csv", index_col="Entry ID")
target_data = data[["SMILES", "S.aureus ATCC25923"]]
target_data.columns = ["SMILES", "TARGET"]
from rdkit import Chem
from rdkit.Chem import Descriptors
def getMolDescriptors(smiles: str, missingVal=None):
mol = Chem.MolFromSmiles(smiles)
res = {}
for nm, fn in Descriptors._descList:
try:
val = fn(mol)
except Exception as e:
val = missingVal
res[nm] = val
return res
# 构建特征矩阵与目标变量
X_df = pd.DataFrame([getMolDescriptors(smi) for smi in target_data['SMILES']])
y = target_data['TARGET'].values
# 剔除无效特征(取值唯一的列)
invalid_features = [col for col in X_df.columns if X_df[col].nunique() <= 1]
X_df.drop(columns=invalid_features, inplace=True)
X = X_df.values
print(f"训练样本数: {len(y)}, 特征数量: {X.shape[1]}")
# ---------------------------
# 转换为GPU数据格式cupy数组
# ---------------------------
import cupy as cp
X_gpu = cp.asarray(X)
y_gpu = cp.asarray(y)
# ---------------------------
# 定义辅助函数计算模型的MSE
# ---------------------------
def model_mse(model, X_data, y_data):
y_pred = model.predict(X_data)
mse = cp.mean((y_data - y_pred) ** 2)
return mse.get()
# ---------------------------
# 定义置换重要性计算函数在GPU上
# ---------------------------
def permutation_importance_gpu(model, X_data, y_data, n_repeats=3, random_state=0):
"""
对于给定的模型,计算每个特征的置换重要性:
重要性 = (打乱特征后的MSE均值) - (原始MSE)
随机打乱每个特征的值,并观察模型性能的下降程度来衡量特征的重要性。 如果打乱某个特征导致模型性能显著下降,说明该特征对模型很重要;反之,如果打乱某个特征对模型性能没有明显影响,说明该特征不太重要。
"""
# 先将数据复制到CPU进行打乱操作数据量较小时不会有太大开销
X_cpu = cp.asnumpy(X_data)
baseline = model_mse(model, X_data, y_data)
importances = np.zeros(X_cpu.shape[1])
rng = np.random.RandomState(random_state)
for j in range(X_cpu.shape[1]):
scores = []
for _ in range(n_repeats):
X_permuted = X_cpu.copy()
rng.shuffle(X_permuted[:, j])
X_permuted_gpu = cp.asarray(X_permuted)
score = model_mse(model, X_permuted_gpu, y_data)
scores.append(score)
importances[j] = np.mean(scores) - baseline
return importances
# ---------------------------
# 使用 cuML 随机森林和置换重要性手动实现 RFECV
# ---------------------------
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import KFold
def cross_val_score_gpu(X_data, y_data, cv=5):
"""
使用 KFold 分割数据,利用 cuML 随机森林评估当前特征子集的CV均方误差
"""
kf = KFold(n_splits=cv, shuffle=True, random_state=0)
mse_scores = []
# 使用 CPU 索引进行分割
X_np = cp.asnumpy(X_data)
for train_index, test_index in kf.split(X_np):
X_train = X_data[train_index, :]
X_test = X_data[test_index, :]
y_train = y_data[train_index]
y_test = y_data[test_index]
model = cuRF(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
mse = model_mse(model, X_test, y_test)
mse_scores.append(mse)
return np.mean(mse_scores)
def manual_rfecv(X_data, y_data, cv=5):
"""
手动实现 RFECV:
- 在当前特征子集上计算CV MSE
- 使用置换重要性评估各特征的重要性
- 移除置换重要性最低的特征
- 记录使CV MSE最小的特征组合
找到一个最佳的特征组合,用于训练最终的模型。 这个子集的大小在每次迭代中都在变化,最终目标是找到一个使交叉验证 MSE 最小的子集
cuml.ensemble.RandomForestRegressor 的 max_features 默认值通常是 sqrt (特征总数的平方根)。 这与 scikit-learn 的 RandomForestRegressor 的默认值相同。
"""
n_features = X_data.shape[1]
features = list(range(n_features))
best_score = float('inf')
best_features = features.copy()
scores_history = []
while len(features) > 0:
current_score = cross_val_score_gpu(X_data[:, features], y_data, cv=cv)
scores_history.append((features.copy(), current_score))
print(f"当前特征数: {len(features)}, CV MSE: {current_score:.4f}")
if current_score < best_score:
best_score = current_score
best_features = features.copy()
if len(features) == 1:
break
# 训练模型并计算置换重要性
model = cuRF(n_estimators=100, random_state=0) # cuml.ensemble.RandomForestRegressor 的 max_features 默认值通常是 sqrt (特征总数的平方根)。 这与 scikit-learn 的 RandomForestRegressor 的默认值相同。
model.fit(X_data[:, features], y_data)
imp = permutation_importance_gpu(model, X_data[:, features], y_data, n_repeats=3, random_state=0)
idx_to_remove = int(np.argmin(imp))
removed_feature = features[idx_to_remove]
print(f"剔除特征索引: {removed_feature},置换重要性: {imp[idx_to_remove]:.4f}")
del features[idx_to_remove]
return best_features, best_score, scores_history
# 执行手动 RFECV使用5折交叉验证
cv_folds = 5
best_features_rfecv, best_mse_rfecv, history = manual_rfecv(X_gpu, y_gpu, cv=cv_folds)
print(f"\n手动RFECV选择了 {len(best_features_rfecv)} 个特征CV MSE: {best_mse_rfecv:.4f}")
selected_feature_names = [X_df.columns[i] for i in best_features_rfecv]
print("选定特征名称:", selected_feature_names)