diff --git a/README.md b/README.md index e914ad7..7bf3e17 100755 --- a/README.md +++ b/README.md @@ -288,5 +288,7 @@ RNA可以用Amber的Parmbsc1的力场,是amber中最新的适合模拟核酸 单纯的常规MD就用水中溶菌酶那个教程就行。力场用我上面提到的就行。 不过你模拟两个碱基的RNA有什么意义吗?是要模拟碱基配对过程(比如不同碱基之间的配对,这个貌似有人研究过了)吗还是怎么样。 -你可以看下这篇文章,今晚才看见的, -Assessment of AMBER Force Fields for Simulations of ssDNA(DOI: 10.1021/acs.jctc.0c0093) \ No newline at end of file +你可以看下这篇文章 + +Assessment of AMBER Force Fields for Simulations of ssDNA(DOI: 10.1021/acs.jctc.0c0093) + diff --git a/README_QMMM.md b/README_QMMM.md new file mode 100644 index 0000000..4c7b753 --- /dev/null +++ b/README_QMMM.md @@ -0,0 +1 @@ +# [ASH - A QM/MM packages](https://mp.weixin.qq.com/s/jjVe81BwBjUWeNSEGymsAw) \ No newline at end of file diff --git a/bioinfo/MHH.py b/bioinfo/MHH.py new file mode 100644 index 0000000..e94fdf0 --- /dev/null +++ b/bioinfo/MHH.py @@ -0,0 +1,73 @@ +import numpy as np +from hmmlearn import hmm + +# 定义状态和观察集合 +# 状态集合代表不同的基因型: MM, MZ, ZZ +states = ["MM", "MZ", "ZZ"] +n_states = len(states) + +# 观测集合为 0 和 1,其中 0 表示 ZZ,1 表示 MM +observations = [0, 1] +n_observations = len(observations) + +# 初始概率分布 (initial state probabilities) +# 定义每个状态的初始概率: MM: 0.4, MZ: 0.2, ZZ: 0.4 +start_probability = np.array([0.4, 0.2, 0.4]) + +# 转移概率矩阵 (state transition matrix) +# 每个状态到其他状态的转移概率,基于重组率,假设重组率为 0.01 +# 例如,从 MM 状态保持不变的概率为 0.99,转移到 MZ 或 ZZ 的概率为 0.005 +transition_probability = np.array([ + [0.99, 0.005, 0.005], # MM -> [MM, MZ, ZZ] + [0.005, 0.99, 0.005], # MZ -> [MM, MZ, ZZ] + [0.005, 0.005, 0.99] # ZZ -> [MM, MZ, ZZ] +]) + +# 输出概率矩阵 (emission probability matrix) +# 定义每个隐藏状态 (MM, MZ, ZZ) 生成观测值 (0, 1) 的概率 +# 基于测序错误率 3% +# 例如,MM 状态下观测到 1 (MM) 的概率为 0.97,观测到 0 (ZZ) 的概率为 0.03 +emission_probability = np.array([ + [0.97, 0.03], # MM -> [观测到 0 (ZZ), 观测到 1 (MM)] + [0.5, 0.5], # MZ -> [观测到 0 (ZZ), 观测到 1 (MM)] + [0.03, 0.97] # ZZ -> [观测到 0 (ZZ), 观测到 1 (MM)] +]) + +# 建立 HMM 模型 +# 使用 hmmlearn 库的 CategoricalHMM 来表示隐马尔可夫模型 +# CategoricalHMM 用于离散观测值的建模,与之前的 MultinomialHMM 有所不同 +# MultinomialHMM 新版本中进行了重大变动,要求明确设置 n_trials,且现在遵循标准的多项式分布定义 +model = hmm.CategoricalHMM(n_components=n_states, n_iter=100, tol=0.01) +model.startprob_ = start_probability # 设置初始状态概率 +model.transmat_ = transition_probability # 设置状态转移概率矩阵 +model.emissionprob_ = emission_probability # 设置输出概率矩阵 + +# 观测序列(给定的 109 个位点的测序结果) +# 将观测结果调整为二维数组,其中每一行为一个观测值 +observation_sequence = np.array([[1], [1], [1], [1], [1], [1], [1], [0], [1], [1], [1], [1], [1], [1], [0], [0], [0], [0], [0], [0], + [1], [0], [0], [0], [0], [0], [0], [0], [1], [0], [1], [0], [1], [0], [1], [1], [1], [1], [1], [1], + [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [0], [1], [0], [1], [0], [1], + [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], + [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], + [1], [1], [1], [1], [1], [1], [1], [1], [1]]).reshape(-1, 1) # 调整为二维数组 + +# 使用维特比算法解码最可能的状态序列 +# 使用维特比算法计算最可能的隐藏状态序列(即最可能的基因型序列) +logprob, hidden_states = model.decode(observation_sequence, algorithm="viterbi") + +# 输出结果 +# 打印最可能的基因型序列 +print("最可能的基因型序列:") +results = ",".join([states[state] for state in hidden_states]) +print(" ".join([states[state] for state in hidden_states])) +new_list= [] +for i in [states[state] for state in hidden_states]: + if i == 'ZZ': + new_list.append('0') + elif i == 'MM': + new_list.append('1') + elif i == 'MZ': + new_list.append('2') + else: + pass +print("".join(new_list)) \ No newline at end of file diff --git a/bioinfo/sum.py b/bioinfo/sum.py new file mode 100644 index 0000000..13b2992 --- /dev/null +++ b/bioinfo/sum.py @@ -0,0 +1,54 @@ +import numpy as np +from jaxtyping import Float, Array + +def needleman_wunsch(seq1: str, seq2: str, + match_score: int = 1, + mismatch_penalty: int = -1, + gap_penalty: int = -2) -> tuple[str, str, Float[Array, "len_seq1+1 len_seq2+1"]]: + # 初始化矩阵 + len_seq1, len_seq2 = len(seq1), len(seq2) + score_matrix: Float[Array, "len_seq1+1 len_seq2+1"] = np.zeros((len_seq1 + 1, len_seq2 + 1)) + + # 初始化第一行和第一列 + for i in range(len_seq1 + 1): + score_matrix[i][0] = i * gap_penalty + for j in range(len_seq2 + 1): + score_matrix[0][j] = j * gap_penalty + + # 填充得分矩阵 + for i in range(1, len_seq1 + 1): + for j in range(1, len_seq2 + 1): + match = score_matrix[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_penalty) + delete = score_matrix[i - 1][j] + gap_penalty + insert = score_matrix[i][j - 1] + gap_penalty + score_matrix[i][j] = max(match, delete, insert) + + # 回溯得到比对结果 + align1, align2 = '', '' + i, j = len_seq1, len_seq2 + while i > 0 or j > 0: + current_score = score_matrix[i][j] + if i > 0 and j > 0 and (current_score == score_matrix[i - 1][j - 1] + (match_score if seq1[i - 1] == seq2[j - 1] else mismatch_penalty)): + align1 = seq1[i - 1] + align1 + align2 = seq2[j - 1] + align2 + i -= 1 + j -= 1 + elif i > 0 and (current_score == score_matrix[i - 1][j] + gap_penalty): + align1 = seq1[i - 1] + align1 + align2 = '-' + align2 + i -= 1 + else: + align1 = '-' + align1 + align2 = seq2[j - 1] + align2 + j -= 1 + + return align1, align2, score_matrix + +# 测试示例 +seq1 = "GATTACA" +seq2 = "GCATGCU" +alignment = needleman_wunsch(seq1, seq2) +print("Aligned Sequences:") +print(alignment[0]) +print(alignment[1]) +print("Alignment Score:", alignment[2]) \ No newline at end of file