Files
gromacs_docker/bioinfo/MHH.py
2024-10-27 15:29:31 +08:00

73 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import numpy as np
from hmmlearn import hmm
# 定义状态和观察集合
# 状态集合代表不同的基因型: MM, MZ, ZZ
states = ["MM", "MZ", "ZZ"]
n_states = len(states)
# 观测集合为 0 和 1其中 0 表示 ZZ1 表示 MM
observations = [0, 1]
n_observations = len(observations)
# 初始概率分布 (initial state probabilities)
# 定义每个状态的初始概率: MM: 0.4, MZ: 0.2, ZZ: 0.4
start_probability = np.array([0.4, 0.2, 0.4])
# 转移概率矩阵 (state transition matrix)
# 每个状态到其他状态的转移概率,基于重组率,假设重组率为 0.01
# 例如,从 MM 状态保持不变的概率为 0.99,转移到 MZ 或 ZZ 的概率为 0.005
transition_probability = np.array([
[0.99, 0.005, 0.005], # MM -> [MM, MZ, ZZ]
[0.005, 0.99, 0.005], # MZ -> [MM, MZ, ZZ]
[0.005, 0.005, 0.99] # ZZ -> [MM, MZ, ZZ]
])
# 输出概率矩阵 (emission probability matrix)
# 定义每个隐藏状态 (MM, MZ, ZZ) 生成观测值 (0, 1) 的概率
# 基于测序错误率 3%
# 例如MM 状态下观测到 1 (MM) 的概率为 0.97,观测到 0 (ZZ) 的概率为 0.03
emission_probability = np.array([
[0.97, 0.03], # MM -> [观测到 0 (ZZ), 观测到 1 (MM)]
[0.5, 0.5], # MZ -> [观测到 0 (ZZ), 观测到 1 (MM)]
[0.03, 0.97] # ZZ -> [观测到 0 (ZZ), 观测到 1 (MM)]
])
# 建立 HMM 模型
# 使用 hmmlearn 库的 CategoricalHMM 来表示隐马尔可夫模型
# CategoricalHMM 用于离散观测值的建模,与之前的 MultinomialHMM 有所不同
# MultinomialHMM 新版本中进行了重大变动,要求明确设置 n_trials且现在遵循标准的多项式分布定义
model = hmm.CategoricalHMM(n_components=n_states, n_iter=100, tol=0.01)
model.startprob_ = start_probability # 设置初始状态概率
model.transmat_ = transition_probability # 设置状态转移概率矩阵
model.emissionprob_ = emission_probability # 设置输出概率矩阵
# 观测序列(给定的 109 个位点的测序结果)
# 将观测结果调整为二维数组,其中每一行为一个观测值
observation_sequence = np.array([[1], [1], [1], [1], [1], [1], [1], [0], [1], [1], [1], [1], [1], [1], [0], [0], [0], [0], [0], [0],
[1], [0], [0], [0], [0], [0], [0], [0], [1], [0], [1], [0], [1], [0], [1], [1], [1], [1], [1], [1],
[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [0], [1], [0], [1], [0], [1],
[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1],
[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1],
[1], [1], [1], [1], [1], [1], [1], [1], [1]]).reshape(-1, 1) # 调整为二维数组
# 使用维特比算法解码最可能的状态序列
# 使用维特比算法计算最可能的隐藏状态序列(即最可能的基因型序列)
logprob, hidden_states = model.decode(observation_sequence, algorithm="viterbi")
# 输出结果
# 打印最可能的基因型序列
print("最可能的基因型序列:")
results = ",".join([states[state] for state in hidden_states])
print(" ".join([states[state] for state in hidden_states]))
new_list= []
for i in [states[state] for state in hidden_states]:
if i == 'ZZ':
new_list.append('0')
elif i == 'MM':
new_list.append('1')
elif i == 'MZ':
new_list.append('2')
else:
pass
print("".join(new_list))