添加注释

2024-12-11 21:25:20 +08:00
parent e8b93c7ac9
commit 4426e7ca88
4 changed files with 74 additions and 41 deletions
--- a/5.0M.py
+++ b/5.0M.py
@@ -7,7 +7,7 @@ from pathlib import Path

 def trainning(): 
    length = 30
-    fold = 10
+    fold = 10 # 交叉验证的折数
    if not Path('POS.txt').exists():
        raise FileNotFoundError("POS.txt or NEG.txt not found.")
    f1 = open(r"POS.txt", "r")
@@ -32,7 +32,7 @@ def trainning():
        total = 0.0
        for j in range(len(l_scores[i])):
            total += l_scores[i][j]
-        raw_scores.append(total) # 展平所有打分
+        raw_scores.append(total) # 每个样本各位置打分之和，可以直接用于排序、筛选或者特征归一化
    X = np.array(l_scores) # 所有评分
    Y = np.array(l_type) # 所有正负列表的标签
    PEP = np.array(l_peps) # 原始丙酰化肽段的原始字符串
@@ -96,53 +96,68 @@ def newAAScore(AAscores, l_aas, AAs, MM_coef):


 def getWeightScoreType(pos, neg, matrix, AAs,length):
-    scores = [] # scores 是一个二维列表，用于存储每个位置上的氨基酸得分。
+    '''
+    这个函数计算每个肽段的得分向量，其中每个位置的得分是基于 BLOSUM62 矩阵和位置权重。
+    获取位置权重确定 PWD 矩阵
+    参数解释：
+    pos: 正样本长度61氨基酸的set集合
+    neg: 负样本长度61氨基酸的set集合
+    matrix: 以 "氨基酸_氨基酸" 形式的字符串为键的相似性得分字典（如 BLOSUM62），即AAscores
+    AAs: 24个氨基酸的列表，用于索引矩阵
+    length: 单侧窗口长度（如30），肽段长度和打分矩阵的行数均为 length*2+1（如 61 = 30+1+30）
+    return: l_scores, l_type,l_peps
+    l_scores: 二维列表，存储每个肽段的打分向量，打分向量是长度为61的列表，每个位置上的打分是该位置氨基酸与所有正样本同一位置氨基酸的相似性得分的平均值（正样本会先扣除自身的贡献）。
+    l_type: 一维列表，存储肽段的标签，1表示正样本，0表示负样本。
+    l_peps: 一维列表，存储肽段的原始字符串。
+    '''
+    scores = [] # scores 是一个二维列表，用于存储正样本每个位置上的氨基酸得分矩阵。
    for i in range(length*2+1): # 61 = 30+1+30
        pos_score = []
-        for j in range(len(AAs)):
-            aa1 = AAs[j]
-            score = 0.0
-            for oth in pos:
-                aa2 = oth[i:i + 1]
-                aas = aa1 + "_" + aa2
-                aas2 = aa2 + "_" + aa1
+        for j in range(len(AAs)):  # 遍历每种氨基酸
+            aa1 = AAs[j]  # 当前氨基酸
+            score = 0.0   # 初始化得分为 0
+            for oth in pos:  # 遍历所有正样本
+                aa2 = oth[i:i + 1]  # 正样本中当前位置的氨基酸
+                aas = aa1 + "_" + aa2  # 生成氨基酸对的键
+                aas2 = aa2 + "_" + aa1  # 反向氨基酸对的键
                if aas in matrix:
-                    score += matrix[aas]
+                    score += matrix[aas]  # 替代矩阵中的相似性得分
                else:
                    score += matrix[aas2]
-            pos_score.append(score)
+            pos_score.append(score)  # 将得分加入当前位置的氨基酸列表
        scores.append(pos_score)

-    l_scores = [] # l_scores 是一个二维列表，用于存储每个肽段的得分向量。
+    l_scores = [] # l_scores 是一个二维列表，用于存储每个肽段的得分向量。对每个肽段，最终形成一个长度为 61 的向量.
    l_type = []
    l_peps = []

    for pep in pos:
        score = []
-        for i in range(len(pep)): # range(0,61)
-            aa = pep[i:i + 1]
-            index = AAs.index(aa)
-            aascore = (scores[i][index] - matrix[aa + "_" + aa]) / (len(pos) - 1) # 减去自身样本的影响，使用样本均值
-            score.append(aascore)
-        l_scores.append(score)
-        l_type.append(1)
-        l_peps.append(pep)
+        for i in range(len(pep)):  # 遍历肽段的每个位置
+            aa = pep[i:i + 1]  # 当前肽段在位置 i 的氨基酸
+            index = AAs.index(aa)  # 获取氨基酸在 AAs 列表中的索引
+            aascore = (scores[i][index] - matrix[aa + "_" + aa]) / (len(pos) - 1)  # 计算平均得分
+            score.append(aascore)  # 加入当前肽段的位置得分
+        l_scores.append(score)  # 将该肽段的打分向量加入到结果列表
+        l_type.append(1)  # 标签为 1，表示正样本
+        l_peps.append(pep)  # 保存肽段的原始字符串

    # num = 0
    for pep in neg:
        score = []
        for i in range(len(pep)):
-            aa = pep[i:i + 1]
-            index = AAs.index(aa)
-            aascore = scores[i][index] / len(pos) # 负样本本身就是噪音并不需要调整自身影响。
-            score.append(aascore)
-        l_scores.append(score)
-        l_type.append(0)
-        l_peps.append(pep)
+            aa = pep[i:i + 1]  # 当前肽段在位置 i 的氨基酸
+            index = AAs.index(aa)  # 获取氨基酸在 AAs 列表中的索引
+            aascore = scores[i][index] / len(pos)  # 直接取正样本的平均得分 # 负样本本身就是噪音并不需要调整自身影响。
+            score.append(aascore)  # 加入当前肽段的位置得分
+        l_scores.append(score)  # 将该肽段的打分向量加入到结果列表
+        l_type.append(0)  # 标签为 0，表示负样本
+        l_peps.append(pep)  # 保存肽段的原始字符串

-    return l_scores, l_type,l_peps
+    return l_scores, l_type,l_peps # 返回含有正负样本的打分向量，打分向量是长度为61的列表，每个位置上的打分是该位置氨基酸与正样本同一位置氨基酸的平均相似性得分。

 def getMMScoreType(pos, neg, matrix, weights, l_aas, AAs,length):
+    # 这个函数计算每个肽段的得分向量，其中每个位置的得分是基于氨基酸对的评分矩阵和位置权重。SMO 矩阵权重
    """
    参数解释：
    
@@ -188,16 +203,16 @@ def getMMScoreType(pos, neg, matrix, weights, l_aas, AAs,length):
        score_neg = []
        for j in range(len(AAs)):
            aa1 = AAs[j]
-            score = []
-            for z in range(len(l_aas)):
-                score.append(0.0)
+            score = [] # 是一个长度为 len(l_aas) 的列表，用来保存每个氨基酸对在当前位置的得分。
+            for z in range(len(l_aas)): # 遍历所有氨基酸对 (a, b)
+                score.append(0.0) # 初始化每个氨基酸对的得分
            for oth in pos:
                aa2 = oth[i:i + 1]
                aas1 = aa1 + "_" + aa2
                aas2 = aa2 + "_" + aa1
                if aas1 in l_aas:
                    index = l_aas.index(aas1)
-                    score[index] += matrix[aas1] * weights[i]
+                    score[index] += matrix[aas1] * weights[i] # 累加氨基酸对 (a, b) 在位置 i 的加权得分
                elif aas2 in l_aas:
                    index = l_aas.index(aas2)
                    score[index] += matrix[aas2] * weights[i]
@@ -360,6 +375,8 @@ def logistic_GPS(X: bytearray, Y: bytearray,PEP,type,turn):
    肽段数据，形状为 (n_samples,)，表示每个样本的肽段序列。虽然在 logistic_GPS 函数中没有直接使用 PEP，但它可能在后续的处理中被用到，例如在 writeParameter_WW 或 writeParameter_MM 函数中。
    主要功能是执行逻辑回归训练，特别是交叉验证选择最佳正则化参数 C，并根据此参数训练逻辑回归模型，返回权重系数和 AUC 分数。

+    turn: 训练轮次，用于动态调整正则化参数 C 的值。
+
    return
    list_coef: list of float
    权重系数，形状为 (n_features,)，表示每个特征的权重。用于加权评分。
@@ -378,18 +395,26 @@ def logistic_GPS(X: bytearray, Y: bytearray,PEP,type,turn):
    重点在于结合氨基酸对的评分矩阵和位置权重，进一步优化模型的性能。
    通常用于后续的训练阶段，通过矩阵乘法进一步调整权重，提高模型的准确性。
    '''
-    solverchose = 'sag'
+    solverchose = 'sag' # 选择求解器
    clscv = LogisticRegressionCV(max_iter=10000, cv=10, solver=solverchose,scoring='roc_auc') # LogisticRegressionCV 的目标是通过最大化AUC交叉验证找到最佳的 C 值
-    clscv.fit(X, Y)
+    '''
+    max_iter=10000: 设置最大迭代次数。
+    cv=10: 使用 10 折交叉验证。
+    solver='sag': 使用随机平均梯度下降求解器。
+    scoring='roc_auc': 使用 ROC AUC 作为评分标准。
+    '''
+    clscv.fit(X, Y) # 通过交叉验证（以 ROC AUC 为评分标准）选出最佳的 C 值
    #regularization = clscv.C_[0]
    regularization = clscv.C_[0] * (10**(-turn)) # 动态调整C的值，通过10^turn 来调整正则化的强度。
    print("C=" + str(regularization))
+    # 使用最佳正则化参数训练逻辑回归模型
    cls = LogisticRegression(max_iter=10000,solver=solverchose,C=regularization)
    cls.fit(X, Y)
-    list_coef = cls.coef_[0]
+    # 获取权重系数和 AUC 分数:
+    list_coef = cls.coef_[0] # 权重Wj，反映每个位置在区分正负样本时的重要性，用于后续加权特征计算。
    predict_prob_x = cls.predict_proba(X)
    predict_x = predict_prob_x[:, 1]
-    auc = roc_auc_score(Y,np.array(predict_x))
+    auc = roc_auc_score(Y,np.array(predict_x)) # 通过 ROC 曲线评估模型性能；注意这里是在训练数据 X 上计算的 AUC，并非交叉验证的 AUC。
    print("AUC:" + str(auc))
    return list_coef,auc

--- a/MAML.py
+++ b/MAML.py
@@ -15,8 +15,8 @@ def fetshot():

    length = 30
    nfold = 5
-    f1 = open(r"functionsite.txt", "r")
-    f2 = open(r"POS.txt", "r")
+    f1 = open(r"demo/functionsite", "r")
+    f2 = open(r"demo/POS.txt", "r")
    funcinf = set()
    for line in f1.readlines():
        site = line.strip()
@@ -45,7 +45,7 @@ def fetshot():
            print(len(neg))

            AAscores, l_aas, weight_coef, AAs = \
-                WeightAndMatrix("traningout_best.txt")
+                WeightAndMatrix("demo/traningout_best.txt")
            l_scores, l_type, peps = getMMScoreType(pos, new_neg, AAscores, weight_coef, l_aas, AAs, length)
            raw_scores = []
            for i in range(len(l_scores)):
--- a/README.md
+++ b/README.md
@@ -35,5 +35,10 @@ Dr. Luoying Zhang: zhangluoying@hust.edu.cn
 Chenwei Wang: wangchenwei@hust.edu.cn  
 Ke Shui: shuike@hust.edu.cn  

+## env
+
+```shell
+conda install -c conda-forge -y keras==2.4.3 numpy scikit-learn pandas
+```


--- a/Tools.py
+++ b/Tools.py
@@ -2,11 +2,14 @@ import numpy as np
 def blosum62():
    f1 = open("BLOSUM62","r")
    l_AAS = []
-    AAs = []
+    AAs = [] # 取BLOSUM62第一列
    scores = {}
    for line in f1.readlines():
+        print('line:',line)
        sp = line.split()
+        print('sp:',sp)
        aa = sp[0]
+        print('aa:',aa)
        AAs.append(aa)
    num = 0
    f1 = open("BLOSUM62","r")