From 818f461bebcbbab87dba67084bf2e8ff1c2c7c26 Mon Sep 17 00:00:00 2001 From: mm644706215 Date: Mon, 3 Mar 2025 22:34:33 +0800 Subject: [PATCH] update --- 1d-qsar/cuda/RFECV流程.md | 86 ++++++ 1d-qsar/cuda/RFE_cuml_permutation_update.log | 295 +++++++++++++++++++ 1d-qsar/cuda/RFE_cuml_permutation_update.py | 24 +- 1d-qsar/cuda/extract_log_CV_MSE.py | 49 +++ 4 files changed, 451 insertions(+), 3 deletions(-) create mode 100644 1d-qsar/cuda/RFECV流程.md create mode 100644 1d-qsar/cuda/extract_log_CV_MSE.py diff --git a/1d-qsar/cuda/RFECV流程.md b/1d-qsar/cuda/RFECV流程.md new file mode 100644 index 0000000..0b4e85a --- /dev/null +++ b/1d-qsar/cuda/RFECV流程.md @@ -0,0 +1,86 @@ +## 置换重要性计算原理与 RFECV 完整流程 + +### 置换重要性的计算公式与原理 +计算公式: + +置换重要性 = 平均打乱后的 MSE − 原始(基线)MSE +原理说明: + +基线性能:首先在原始数据上计算模型的均方误差(MSE),作为基线性能。 +打乱特征:对于每个特征,重复 n 次将该特征的数值随机打乱(破坏其与目标变量的关联),然后计算模型在打乱后数据上的 MSE。 +重要性度量:打乱某个特征后,如果模型性能恶化(MSE 增加),说明该特征对预测有较强贡献;反之,如果 MSE 变化不大甚至降低,则说明该特征的重要性较低。 +因此,置换重要性反映了打乱一个特征后模型性能的下降程度——数值越大(正值越大),表明该特征越重要;若为负值,则可能暗示该特征对模型存在“干扰”作用(或模型过拟合某些噪声)。 + +### RFECV 的完整流程 +RFECV(递归特征消除交叉验证)在代码中的实现是一个贪心的特征筛选过程,完整流程如下: + +初始特征集 + +从原始数据 $X$ 的所有特征开始,构成完整的特征集。 +模型性能评估 + +利用当前的特征集,通过交叉验证(例如 5 折)计算模型的平均 MSE,作为当前特征组合的基线性能。 +计算置换重要性 + +针对当前特征集,训练一个随机森林模型。 +对于每个特征: +将该特征的数值在验证数据中随机打乱(注意这里打乱是对当前特征集中的一个特征进行,而其它特征保持不变)。 +重复打乱 n 次,分别计算模型在打乱数据上的 MSE。 +取平均后减去基线 MSE,即得到该特征的置换重要性。 +剔除最不重要特征 + +从当前特征集中,选择置换重要性最低的那个特征(也就是打乱后对模型性能影响最小的特征)。 +将这个特征剔除,同时记录该特征的置换重要性变化。 +迭代更新 + +在新的特征集下重复步骤 2~4,不断剔除对模型贡献较小的特征。 +在每一次迭代中,都记录下当前特征组合对应的交叉验证 MSE。 +选择最佳特征组合 + +当特征数量逐步减少时,保存使得交叉验证 MSE 最低的特征组合。 +最终输出最佳的特征子集,即使得模型预测性能最优的那一组特征。 +注意几点: + +这里的“组合”指的是当前保留的特征集,并不是说对 $X$ 中的所有可能子集进行遍历(那样的组合数目将非常庞大),而是采用逐步剔除的贪心策略。 +每一步只考虑剔除一个特征,而不是一次性移除多个特征。这种方法的优势在于计算量大幅降低,但也可能不保证全局最优。 +置换重要性的计算是有重复(n_repeats)的,其目的是降低随机扰动带来的波动,从而得到一个较为稳定的特征重要性评价。 + + +### CV MSE 曲线 与 置换重要性图 + +1. 
CV MSE 与树的数量和特征组合的关系 + +提取日志绘制CV MSE 曲线:脚本 extract_log_CV_MSE.py + +确定随机森林树数: +在动态评估树数的过程中,CV MSE 用于观察随着树数增加,模型预测性能(均方误差)的变化,直到达到收敛(即连续两次差值低于阈值)。 + +确定最佳特征组合: +在 RFECV 的过程中,每一次迭代都计算当前特征集合下的 CV MSE。即便不同组合间的变化可能很小,整体趋势也能告诉我们哪些特征组合使得 CV MSE 达到最低值。 +如果不同特征组合下 CV MSE 的变化都很小,可能表明这些分子特征对 y 的解释力本身就不强,或者模型对部分特征缺失具有较强鲁棒性。 + +打印并绘制 CV MSE 曲线: +你可以将每轮迭代打印的结果(例如 print(f"Current number of features: {len(features)}, CV MSE: {current_score:.4f}"))记录下来,然后绘制一张图,横轴为当前保留特征的数量,纵轴为对应的 CV MSE。 + +如果在这张图中某个迭代点出现了较大的波动(例如 CV MSE 突然增大或减小),可能说明在这一轮剔除的特征对整体预测影响较大。 +如果突然下降,则可能说明删除了一个干扰特征;如果突然上升,则可能删除了一个很重要的特征,使得模型性能大幅下降。 + +2. rfecv_perm_importance.png 的意义 + +置换重要性图: +这个图记录了 RFECV 过程中每次剔除的特征对应的置换重要性(也就是:打乱该特征后的 MSE与基线 MSE 的差值)。 + +置换重要性反映的是单个特征被打乱时对模型性能的影响。 +图中每个点表示在某个迭代中被删除特征的置换重要性,数值越大(正值),说明该特征单独对模型有较大贡献;数值较小(接近 0 或负值)则说明单独贡献较低。 +两张图之间的关系: + +CV MSE 曲线图: 展示的是每个特征子集整体的预测性能变化,反映了特征组合对模型表现的影响。 +置换重要性图: 反映了在每次迭代中,被剔除的那个特征对模型性能影响的大小。 +如果在某一次迭代中,剔除某个特征导致 CV MSE 突然上升,并且那一轮的置换重要性数值明显较大,那么这可以表明该特征在组合中起到了关键作用,即使它单独的相关性不高,也可能在和其它特征交互时非常重要。 +总结来说: + +CV MSE 图主要展示整体特征组合的效果; +置换重要性图反映每次剔除操作对模型影响的即时变化。 +这两者可以互相印证:当置换重要性图中出现较大波动时(比如一个点明显高于其他点),对应的 CV MSE 曲线通常也会显示出一个拐点,提示这个特征在整个组合中的重要性。这样你既可以从整体角度看模型的稳定性,也可以从单个特征角度分析它们对模型的贡献。 \ No newline at end of file diff --git a/1d-qsar/cuda/RFE_cuml_permutation_update.log b/1d-qsar/cuda/RFE_cuml_permutation_update.log index 0636052..4d854fa 100644 --- a/1d-qsar/cuda/RFE_cuml_permutation_update.log +++ b/1d-qsar/cuda/RFE_cuml_permutation_update.log @@ -36,3 +36,298 @@ Current number of features: 147, CV MSE: 4.7937 Removed feature index: 12, feature name: MaxAbsPartialCharge, permutation importance: -0.0008 Current number of features: 146, CV MSE: 4.7589 Removed feature index: 2, feature name: MinAbsEStateIndex, permutation importance: -0.0007 +Current number of features: 145, CV MSE: 4.8444 +Removed feature index: 125, feature name: fr_Ar_OH, permutation importance: -0.0000 +Current number of features: 144, CV MSE: 4.8518 +Removed feature index: 119, feature name: 
RingCount, permutation importance: -0.0019 +Current number of features: 143, CV MSE: 4.6953 +Removed feature index: 67, feature name: SMR_VSA9, permutation importance: -0.0000 +Current number of features: 142, CV MSE: 4.7689 +Removed feature index: 77, feature name: SlogP_VSA8, permutation importance: -0.0001 +Current number of features: 141, CV MSE: 4.7446 +Removed feature index: 55, feature name: PEOE_VSA5, permutation importance: -0.0001 +Current number of features: 140, CV MSE: 4.8110 +Removed feature index: 66, feature name: SMR_VSA7, permutation importance: -0.0011 +Current number of features: 139, CV MSE: 4.7815 +Removed feature index: 0, feature name: MaxAbsEStateIndex, permutation importance: 0.0000 +Current number of features: 138, CV MSE: 4.8477 +Removed feature index: 148, feature name: fr_para_hydroxylation, permutation importance: -0.0003 +Current number of features: 137, CV MSE: 4.7395 +Removed feature index: 48, feature name: PEOE_VSA11, permutation importance: -0.0000 +Current number of features: 136, CV MSE: 4.6840 +Removed feature index: 150, feature name: fr_phenol_noOrthoHbond, permutation importance: -0.0000 +Current number of features: 135, CV MSE: 4.8117 +Removed feature index: 60, feature name: SMR_VSA1, permutation importance: -0.0001 +Current number of features: 134, CV MSE: 4.7988 +Removed feature index: 149, feature name: fr_phenol, permutation importance: -0.0000 +Current number of features: 133, CV MSE: 4.7597 +Removed feature index: 76, feature name: SlogP_VSA6, permutation importance: -0.0000 +Current number of features: 132, CV MSE: 4.7059 +Removed feature index: 69, feature name: SlogP_VSA10, permutation importance: -0.0000 +Current number of features: 131, CV MSE: 4.7498 +Removed feature index: 51, feature name: PEOE_VSA14, permutation importance: -0.0000 +Current number of features: 130, CV MSE: 4.7434 +Removed feature index: 142, feature name: fr_imidazole, permutation importance: -0.0000 +Current number of features: 129, CV 
MSE: 4.7211 +Removed feature index: 105, feature name: NumAmideBonds, permutation importance: -0.0001 +Current number of features: 128, CV MSE: 4.8107 +Removed feature index: 53, feature name: PEOE_VSA3, permutation importance: -0.0000 +Current number of features: 127, CV MSE: 4.7079 +Removed feature index: 86, feature name: EState_VSA7, permutation importance: -0.0009 +Current number of features: 126, CV MSE: 4.7144 +Removed feature index: 136, feature name: fr_aryl_methyl, permutation importance: -0.0000 +Current number of features: 125, CV MSE: 4.7590 +Removed feature index: 113, feature name: NumHeterocycles, permutation importance: -0.0013 +Current number of features: 124, CV MSE: 4.8185 +Removed feature index: 146, feature name: fr_nitro_arom, permutation importance: -0.0000 +Current number of features: 123, CV MSE: 4.7051 +Removed feature index: 153, feature name: fr_thiazole, permutation importance: -0.0000 +Current number of features: 122, CV MSE: 4.6048 +Removed feature index: 138, feature name: fr_bicyclic, permutation importance: -0.0006 +Current number of features: 121, CV MSE: 4.5805 +Removed feature index: 147, feature name: fr_nitro_arom_nonortho, permutation importance: -0.0000 +Current number of features: 120, CV MSE: 4.8172 +Removed feature index: 62, feature name: SMR_VSA3, permutation importance: -0.0001 +Current number of features: 119, CV MSE: 4.8393 +Removed feature index: 127, feature name: fr_C_O_noCOO, permutation importance: -0.0002 +Current number of features: 118, CV MSE: 4.7780 +Removed feature index: 1, feature name: MaxEStateIndex, permutation importance: 0.0000 +Current number of features: 117, CV MSE: 4.7227 +Removed feature index: 3, feature name: MinEStateIndex, permutation importance: 0.0000 +Current number of features: 116, CV MSE: 4.7864 +Removed feature index: 4, feature name: qed, permutation importance: 0.0000 +Current number of features: 115, CV MSE: 4.6388 +Removed feature index: 5, feature name: SPS, permutation 
importance: 0.0000 +Current number of features: 114, CV MSE: 4.5150 +Removed feature index: 140, feature name: fr_furan, permutation importance: -0.0001 +Current number of features: 113, CV MSE: 4.6324 +Removed feature index: 6, feature name: MolWt, permutation importance: 0.0000 +Current number of features: 112, CV MSE: 4.7494 +Removed feature index: 24, feature name: BCUT2D_MRLOW, permutation importance: -0.0009 +Current number of features: 111, CV MSE: 4.6774 +Removed feature index: 80, feature name: EState_VSA10, permutation importance: -0.0001 +Current number of features: 110, CV MSE: 4.7463 +Removed feature index: 70, feature name: SlogP_VSA11, permutation importance: -0.0000 +Current number of features: 109, CV MSE: 4.7211 +Removed feature index: 54, feature name: PEOE_VSA4, permutation importance: -0.0000 +Current number of features: 108, CV MSE: 4.6060 +Removed feature index: 7, feature name: HeavyAtomMolWt, permutation importance: 0.0000 +Current number of features: 107, CV MSE: 4.5989 +Removed feature index: 88, feature name: EState_VSA9, permutation importance: -0.0001 +Current number of features: 106, CV MSE: 4.5530 +Removed feature index: 106, feature name: NumAromaticCarbocycles, permutation importance: -0.0008 +Current number of features: 105, CV MSE: 4.6703 +Removed feature index: 8, feature name: ExactMolWt, permutation importance: 0.0000 +Current number of features: 104, CV MSE: 4.4274 +Removed feature index: 145, feature name: fr_nitro, permutation importance: -0.0000 +Current number of features: 103, CV MSE: 4.7372 +Removed feature index: 9, feature name: NumValenceElectrons, permutation importance: 0.0000 +Current number of features: 102, CV MSE: 4.6272 +Removed feature index: 129, feature name: fr_NH1, permutation importance: -0.0004 +Current number of features: 101, CV MSE: 4.6820 +Removed feature index: 10, feature name: MaxPartialCharge, permutation importance: 0.0000 +Current number of features: 100, CV MSE: 4.5780 +Removed feature index: 
13, feature name: MinAbsPartialCharge, permutation importance: -0.0000 +Current number of features: 99, CV MSE: 4.6734 +Removed feature index: 139, feature name: fr_ether, permutation importance: -0.0002 +Current number of features: 98, CV MSE: 4.6922 +Removed feature index: 50, feature name: PEOE_VSA13, permutation importance: -0.0000 +Current number of features: 97, CV MSE: 4.6520 +Removed feature index: 134, feature name: fr_amide, permutation importance: -0.0002 +Current number of features: 96, CV MSE: 4.6574 +Removed feature index: 14, feature name: FpDensityMorgan1, permutation importance: 0.0000 +Current number of features: 95, CV MSE: 4.7120 +Removed feature index: 15, feature name: FpDensityMorgan2, permutation importance: 0.0000 +Current number of features: 94, CV MSE: 4.6882 +Removed feature index: 16, feature name: FpDensityMorgan3, permutation importance: 0.0000 +Current number of features: 93, CV MSE: 4.5839 +Removed feature index: 17, feature name: BCUT2D_MWHI, permutation importance: 0.0000 +Current number of features: 92, CV MSE: 4.6231 +Removed feature index: 18, feature name: BCUT2D_MWLOW, permutation importance: 0.0000 +Current number of features: 91, CV MSE: 4.6144 +Removed feature index: 154, feature name: fr_thiophene, permutation importance: 0.0000 +Current number of features: 90, CV MSE: 4.4886 +Removed feature index: 19, feature name: BCUT2D_CHGHI, permutation importance: 0.0000 +Current number of features: 89, CV MSE: 4.5828 +Removed feature index: 20, feature name: BCUT2D_CHGLO, permutation importance: 0.0000 +Current number of features: 88, CV MSE: 4.4967 +Removed feature index: 144, feature name: fr_methoxy, permutation importance: -0.0003 +Current number of features: 87, CV MSE: 4.5881 +Removed feature index: 21, feature name: BCUT2D_LOGPHI, permutation importance: 0.0000 +Current number of features: 86, CV MSE: 4.4972 +Removed feature index: 22, feature name: BCUT2D_LOGPLOW, permutation importance: 0.0000 +Current number of features: 
85, CV MSE: 4.5007 +Removed feature index: 137, feature name: fr_benzene, permutation importance: -0.0010 +Current number of features: 84, CV MSE: 4.5097 +Removed feature index: 23, feature name: BCUT2D_MRHI, permutation importance: 0.0000 +Current number of features: 83, CV MSE: 4.4847 +Removed feature index: 25, feature name: AvgIpc, permutation importance: 0.0000 +Current number of features: 82, CV MSE: 4.6722 +Removed feature index: 26, feature name: BalabanJ, permutation importance: 0.0000 +Current number of features: 81, CV MSE: 4.5776 +Removed feature index: 27, feature name: BertzCT, permutation importance: 0.0000 +Current number of features: 80, CV MSE: 4.5790 +Removed feature index: 28, feature name: Chi0, permutation importance: 0.0000 +Current number of features: 79, CV MSE: 4.5349 +Removed feature index: 29, feature name: Chi0n, permutation importance: 0.0000 +Current number of features: 78, CV MSE: 4.5542 +Removed feature index: 30, feature name: Chi0v, permutation importance: 0.0000 +Current number of features: 77, CV MSE: 4.5299 +Removed feature index: 31, feature name: Chi1, permutation importance: 0.0000 +Current number of features: 76, CV MSE: 4.6087 +Removed feature index: 32, feature name: Chi1n, permutation importance: 0.0000 +Current number of features: 75, CV MSE: 4.6009 +Removed feature index: 33, feature name: Chi1v, permutation importance: 0.0000 +Current number of features: 74, CV MSE: 4.4684 +Removed feature index: 34, feature name: Chi2n, permutation importance: 0.0000 +Current number of features: 73, CV MSE: 4.5094 +Removed feature index: 35, feature name: Chi2v, permutation importance: 0.0000 +Current number of features: 72, CV MSE: 4.4700 +Removed feature index: 49, feature name: PEOE_VSA12, permutation importance: -0.0001 +Current number of features: 71, CV MSE: 4.5176 +Removed feature index: 36, feature name: Chi3n, permutation importance: 0.0000 +Current number of features: 70, CV MSE: 4.4335 +Removed feature index: 37, feature 
name: Chi3v, permutation importance: -0.0000 +Current number of features: 69, CV MSE: 4.4646 +Removed feature index: 38, feature name: Chi4n, permutation importance: 0.0000 +Current number of features: 68, CV MSE: 4.4792 +Removed feature index: 39, feature name: Chi4v, permutation importance: -0.0000 +Current number of features: 67, CV MSE: 4.4398 +Removed feature index: 40, feature name: HallKierAlpha, permutation importance: 0.0000 +Current number of features: 66, CV MSE: 4.3681 +Removed feature index: 41, feature name: Ipc, permutation importance: 0.0001 +Current number of features: 65, CV MSE: 4.3967 +Removed feature index: 42, feature name: Kappa1, permutation importance: 0.0000 +Current number of features: 64, CV MSE: 4.4340 +Removed feature index: 43, feature name: Kappa2, permutation importance: 0.0000 +Current number of features: 63, CV MSE: 4.4324 +Removed feature index: 44, feature name: Kappa3, permutation importance: 0.0000 +Current number of features: 62, CV MSE: 4.5365 +Removed feature index: 45, feature name: LabuteASA, permutation importance: 0.0000 +Current number of features: 61, CV MSE: 4.5531 +Removed feature index: 46, feature name: PEOE_VSA1, permutation importance: 0.0000 +Current number of features: 60, CV MSE: 4.4241 +Removed feature index: 56, feature name: PEOE_VSA6, permutation importance: 0.0000 +Current number of features: 59, CV MSE: 4.4180 +Removed feature index: 57, feature name: PEOE_VSA7, permutation importance: -0.0000 +Current number of features: 58, CV MSE: 4.3972 +Removed feature index: 58, feature name: PEOE_VSA8, permutation importance: 0.0000 +Current number of features: 57, CV MSE: 4.2306 +Removed feature index: 59, feature name: PEOE_VSA9, permutation importance: 0.0000 +Current number of features: 56, CV MSE: 4.2263 +Removed feature index: 61, feature name: SMR_VSA10, permutation importance: 0.0000 +Current number of features: 55, CV MSE: 4.4408 +Removed feature index: 63, feature name: SMR_VSA4, permutation importance: 
0.0002 +Current number of features: 54, CV MSE: 4.4805 +Removed feature index: 64, feature name: SMR_VSA5, permutation importance: 0.0003 +Current number of features: 53, CV MSE: 4.4570 +Removed feature index: 65, feature name: SMR_VSA6, permutation importance: 0.0000 +Current number of features: 52, CV MSE: 4.4523 +Removed feature index: 68, feature name: SlogP_VSA1, permutation importance: -0.0000 +Current number of features: 51, CV MSE: 4.4382 +Removed feature index: 71, feature name: SlogP_VSA12, permutation importance: 0.0000 +Current number of features: 50, CV MSE: 4.4065 +Removed feature index: 72, feature name: SlogP_VSA2, permutation importance: 0.0000 +Current number of features: 49, CV MSE: 4.3169 +Removed feature index: 73, feature name: SlogP_VSA3, permutation importance: 0.0000 +Current number of features: 48, CV MSE: 4.2661 +Removed feature index: 74, feature name: SlogP_VSA4, permutation importance: 0.0000 +Current number of features: 47, CV MSE: 4.3875 +Removed feature index: 75, feature name: SlogP_VSA5, permutation importance: 0.0004 +Current number of features: 46, CV MSE: 4.3418 +Removed feature index: 78, feature name: TPSA, permutation importance: -0.0000 +Current number of features: 45, CV MSE: 4.4599 +Removed feature index: 79, feature name: EState_VSA1, permutation importance: 0.0000 +Current number of features: 44, CV MSE: 4.5502 +Removed feature index: 81, feature name: EState_VSA2, permutation importance: 0.0000 +Current number of features: 43, CV MSE: 4.4299 +Removed feature index: 135, feature name: fr_aniline, permutation importance: 0.0025 +Current number of features: 42, CV MSE: 4.4397 +Removed feature index: 82, feature name: EState_VSA3, permutation importance: 0.0000 +Current number of features: 41, CV MSE: 4.4255 +Removed feature index: 83, feature name: EState_VSA4, permutation importance: 0.0003 +Current number of features: 40, CV MSE: 4.4042 +Removed feature index: 84, feature name: EState_VSA5, permutation importance: 
0.0000 +Current number of features: 39, CV MSE: 4.4017 +Removed feature index: 85, feature name: EState_VSA6, permutation importance: 0.0000 +Current number of features: 38, CV MSE: 4.3463 +Removed feature index: 87, feature name: EState_VSA8, permutation importance: 0.0003 +Current number of features: 37, CV MSE: 4.5079 +Removed feature index: 89, feature name: VSA_EState1, permutation importance: -0.0001 +Current number of features: 36, CV MSE: 4.4404 +Removed feature index: 90, feature name: VSA_EState10, permutation importance: 0.0002 +Current number of features: 35, CV MSE: 4.4802 +Removed feature index: 91, feature name: VSA_EState2, permutation importance: -0.0000 +Current number of features: 34, CV MSE: 4.5477 +Removed feature index: 92, feature name: VSA_EState3, permutation importance: 0.0000 +Current number of features: 33, CV MSE: 4.5129 +Removed feature index: 93, feature name: VSA_EState4, permutation importance: 0.0000 +Current number of features: 32, CV MSE: 4.4643 +Removed feature index: 94, feature name: VSA_EState5, permutation importance: 0.0004 +Current number of features: 31, CV MSE: 4.6173 +Removed feature index: 95, feature name: VSA_EState6, permutation importance: 0.0000 +Current number of features: 30, CV MSE: 4.5979 +Removed feature index: 96, feature name: VSA_EState7, permutation importance: 0.0002 +Current number of features: 29, CV MSE: 4.4120 +Removed feature index: 97, feature name: VSA_EState8, permutation importance: 0.0000 +Current number of features: 28, CV MSE: 4.4958 +Removed feature index: 98, feature name: VSA_EState9, permutation importance: 0.0004 +Current number of features: 27, CV MSE: 4.5448 +Removed feature index: 99, feature name: FractionCSP3, permutation importance: 0.0002 +Current number of features: 26, CV MSE: 4.3483 +Removed feature index: 100, feature name: HeavyAtomCount, permutation importance: 0.0014 +Current number of features: 25, CV MSE: 4.3617 +Removed feature index: 101, feature name: NHOHCount, 
permutation importance: 0.0001 +Current number of features: 24, CV MSE: 4.4947 +Removed feature index: 102, feature name: NOCount, permutation importance: 0.0017 +Current number of features: 23, CV MSE: 4.2328 +Removed feature index: 103, feature name: NumAliphaticHeterocycles, permutation importance: 0.0022 +Current number of features: 22, CV MSE: 4.3158 +Removed feature index: 104, feature name: NumAliphaticRings, permutation importance: 0.0000 +Current number of features: 21, CV MSE: 4.3848 +Removed feature index: 108, feature name: NumAromaticRings, permutation importance: 0.0000 +Current number of features: 20, CV MSE: 4.3861 +Removed feature index: 109, feature name: NumAtomStereoCenters, permutation importance: 0.0009 +Current number of features: 19, CV MSE: 4.1140 +Removed feature index: 110, feature name: NumHAcceptors, permutation importance: 0.0002 +Current number of features: 18, CV MSE: 4.0750 +Removed feature index: 111, feature name: NumHDonors, permutation importance: 0.0000 +Current number of features: 17, CV MSE: 3.9148 +Removed feature index: 112, feature name: NumHeteroatoms, permutation importance: 0.0004 +Current number of features: 16, CV MSE: 4.0441 +Removed feature index: 114, feature name: NumRotatableBonds, permutation importance: 0.0016 +Current number of features: 15, CV MSE: 3.9886 +Removed feature index: 115, feature name: NumSaturatedHeterocycles, permutation importance: 0.0000 +Current number of features: 14, CV MSE: 4.1169 +Removed feature index: 116, feature name: NumSaturatedRings, permutation importance: 0.0000 +Current number of features: 13, CV MSE: 4.1846 +Removed feature index: 117, feature name: NumUnspecifiedAtomStereoCenters, permutation importance: 0.0000 +Current number of features: 12, CV MSE: 4.2129 +Removed feature index: 118, feature name: Phi, permutation importance: 0.0020 +Current number of features: 11, CV MSE: 4.2006 +Removed feature index: 120, feature name: MolLogP, permutation importance: 0.0024 +Current 
number of features: 10, CV MSE: 3.9221 +Removed feature index: 121, feature name: MolMR, permutation importance: 0.0032 +Current number of features: 9, CV MSE: 3.9000 +Removed feature index: 122, feature name: fr_Al_OH, permutation importance: 0.0015 +Current number of features: 8, CV MSE: 4.1202 +Removed feature index: 123, feature name: fr_Al_OH_noTert, permutation importance: 0.0020 +Current number of features: 7, CV MSE: 4.0491 +Removed feature index: 126, feature name: fr_C_O, permutation importance: 0.0002 +Current number of features: 6, CV MSE: 3.8452 +Removed feature index: 128, feature name: fr_NH0, permutation importance: 0.0003 +Current number of features: 5, CV MSE: 3.8135 +Removed feature index: 131, feature name: fr_aldehyde, permutation importance: 0.0013 +Current number of features: 4, CV MSE: 3.8181 +Removed feature index: 130, feature name: fr_Ndealkylation2, permutation importance: 0.0106 +Current number of features: 3, CV MSE: 3.9118 +Removed feature index: 132, feature name: fr_alkyl_halide, permutation importance: -0.0000 +Current number of features: 2, CV MSE: 3.9524 +Removed feature index: 141, feature name: fr_halogen, permutation importance: 0.0000 +Current number of features: 1, CV MSE: 3.9524 + +Manual RFECV selected 5 features, CV MSE: 3.8135 +Selected feature names: ['fr_Ndealkylation2', 'fr_aldehyde', 'fr_alkyl_halide', 'fr_halogen', 'fr_piperdine'] +Permutation Importance Dictionary: {'fr_aniline': np.float64(0.00246189091323068), 'HeavyAtomCount': np.float64(0.0014426564014682342), 'NOCount': np.float64(0.0016944533320952804), 'NumAliphaticHeterocycles': np.float64(0.0021747410248889043), 'NumRotatableBonds': np.float64(0.0015911528501808547), 'Phi': np.float64(0.002030119145237652), 'MolLogP': np.float64(0.002402469195120438), 'MolMR': np.float64(0.003153584746324878), 'fr_Al_OH': np.float64(0.0015341504672248263), 'fr_Al_OH_noTert': np.float64(0.0020245992143588243), 'fr_aldehyde': np.float64(0.0013244453267742262), 
'fr_Ndealkylation2': np.float64(0.010645430860510174)} + +Final model (selected features, n_estimators=350) CV MSE: 3.8135 diff --git a/1d-qsar/cuda/RFE_cuml_permutation_update.py b/1d-qsar/cuda/RFE_cuml_permutation_update.py index 29c6f0b..36f6f33 100644 --- a/1d-qsar/cuda/RFE_cuml_permutation_update.py +++ b/1d-qsar/cuda/RFE_cuml_permutation_update.py @@ -217,14 +217,32 @@ plt.title('Permutation Importance during RFECV') plt.grid(True) # Annotation threshold: annotate points where absolute permutation importance > 0.002 -annotation_threshold = 0.002 +annotation_threshold = 0.001 for i, imp_val in enumerate(perm_imp_history): - if abs(imp_val) > annotation_threshold: - plt.annotate(removed_feature_names[i], (iterations[i], imp_val), textcoords="offset points", xytext=(0,5), ha='center') + if imp_val > annotation_threshold: + plt.annotate(removed_feature_names[i], + (iterations[i], imp_val), + textcoords="offset points", + xytext=(0,5), + ha='center') plt.savefig("rfecv_perm_importance.png", dpi=300) plt.close() +# --------------------------- +# 保存置换重要性数据到字典并序列化为 pkl 文件 +# --------------------------- +# 构建字典,只保存置换重要性大于 0.001 的分子属性 +perm_importance_dict = {name: imp for name, imp in zip(removed_feature_names, perm_imp_history) if imp > annotation_threshold} + +# 将字典保存到 pickle 文件中 +import pickle +with open("perm_importance_dict.pkl", "wb") as f: + pickle.dump(perm_importance_dict, f) + +# 打印字典 +print("Permutation Importance Dictionary:", perm_importance_dict) + # --------------------------- # Final model training: train final model using best feature subset and optimal number of trees # --------------------------- diff --git a/1d-qsar/cuda/extract_log_CV_MSE.py b/1d-qsar/cuda/extract_log_CV_MSE.py new file mode 100644 index 0000000..d094284 --- /dev/null +++ b/1d-qsar/cuda/extract_log_CV_MSE.py @@ -0,0 +1,49 @@ +import re +import matplotlib.pyplot as plt + +def extract_cv_mse(log_path): + """ + 从日志文件中提取特征数和对应的CV MSE。 + 日志行格式示例: + "Current number of features: 156, CV 
MSE: 4.9253" + """ + feature_counts = [] + cv_mses = [] + pattern = re.compile(r"Current number of features:\s*(\d+),\s*CV MSE:\s*([\d\.]+)") + with open(log_path, "r") as f: + for line in f: + match = pattern.search(line) + if match: + feature_counts.append(int(match.group(1))) + cv_mses.append(float(match.group(2))) + return feature_counts, cv_mses + +def plot_cv_mse(log_path, output_image="cv_mse_curve.png"): + """ + 根据日志文件中的数据绘制 CV MSE 曲线,并保存图片 + """ + feature_counts, cv_mses = extract_cv_mse(log_path) + + if not feature_counts: + print("没有在日志文件中找到有效的 'Current number of features' 数据。") + return + + plt.figure(figsize=(8,6)) + plt.plot(feature_counts, cv_mses, marker='o', linestyle='-') + plt.xlabel("Number of Features") + plt.ylabel("CV MSE") + plt.title("CV MSE vs. Number of Features") + plt.grid(True) + plt.savefig(output_image, dpi=300) + plt.close() + print(f"CV MSE 曲线图已保存为: {output_image}") + +if __name__ == "__main__": + import sys + if len(sys.argv) < 2: + print("用法: python script.py ") + else: + log_file = sys.argv[1] + plot_cv_mse(log_file) + +# python 1d-qsar/cuda/RFE_cuml_permutation_update.log