import numpy as np from hmmlearn import hmm # 定义状态和观察集合 # 状态集合代表不同的基因型: MM, MZ, ZZ states = ["MM", "MZ", "ZZ"] n_states = len(states) # 观测集合为 0 和 1,其中 0 表示 ZZ,1 表示 MM observations = [0, 1] n_observations = len(observations) # 初始概率分布 (initial state probabilities) # 定义每个状态的初始概率: MM: 0.4, MZ: 0.2, ZZ: 0.4 start_probability = np.array([0.4, 0.2, 0.4]) # 转移概率矩阵 (state transition matrix) # 每个状态到其他状态的转移概率,基于重组率,假设重组率为 0.01 # 例如,从 MM 状态保持不变的概率为 0.99,转移到 MZ 或 ZZ 的概率为 0.005 transition_probability = np.array([ [0.99, 0.005, 0.005], # MM -> [MM, MZ, ZZ] [0.005, 0.99, 0.005], # MZ -> [MM, MZ, ZZ] [0.005, 0.005, 0.99] # ZZ -> [MM, MZ, ZZ] ]) # 输出概率矩阵 (emission probability matrix) # 定义每个隐藏状态 (MM, MZ, ZZ) 生成观测值 (0, 1) 的概率 # 基于测序错误率 3% # 例如,MM 状态下观测到 1 (MM) 的概率为 0.97,观测到 0 (ZZ) 的概率为 0.03 emission_probability = np.array([ [0.97, 0.03], # MM -> [观测到 0 (ZZ), 观测到 1 (MM)] [0.5, 0.5], # MZ -> [观测到 0 (ZZ), 观测到 1 (MM)] [0.03, 0.97] # ZZ -> [观测到 0 (ZZ), 观测到 1 (MM)] ]) # 建立 HMM 模型 # 使用 hmmlearn 库的 CategoricalHMM 来表示隐马尔可夫模型 # CategoricalHMM 用于离散观测值的建模,与之前的 MultinomialHMM 有所不同 # MultinomialHMM 新版本中进行了重大变动,要求明确设置 n_trials,且现在遵循标准的多项式分布定义 model = hmm.CategoricalHMM(n_components=n_states, n_iter=100, tol=0.01) model.startprob_ = start_probability # 设置初始状态概率 model.transmat_ = transition_probability # 设置状态转移概率矩阵 model.emissionprob_ = emission_probability # 设置输出概率矩阵 # 观测序列(给定的 109 个位点的测序结果) # 将观测结果调整为二维数组,其中每一行为一个观测值 observation_sequence = np.array([[1], [1], [1], [1], [1], [1], [1], [0], [1], [1], [1], [1], [1], [1], [0], [0], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [0], [0], [1], [0], [1], [0], [1], [0], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [0], [1], [0], [1], [0], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1]]).reshape(-1, 1) # 调整为二维数组 # 使用维特比算法解码最可能的状态序列 # 使用维特比算法计算最可能的隐藏状态序列(即最可能的基因型序列) logprob, hidden_states = model.decode(observation_sequence, algorithm="viterbi") # 输出结果 # 打印最可能的基因型序列 print("最可能的基因型序列:") results = ",".join([states[state] for state in hidden_states]) print(" ".join([states[state] for state in hidden_states])) new_list= [] for i in [states[state] for state in hidden_states]: if i == 'ZZ': new_list.append('0') elif i == 'MM': new_list.append('1') elif i == 'MZ': new_list.append('2') else: pass print("".join(new_list))