add analysis code

2024-01-18 18:10:01 +08:00
parent 8064244f51
commit d300702a15
1 changed files with 209 additions and 2 deletions
--- a/test.py
+++ b/test.py
@@ -48,8 +48,215 @@ if len(runner_directories) != len(fasta_files):
 # fasta_files.append(Path('/mnt/mydrive/analysis_pdb-dev/test.fasta'))

 # # Test each FASTA file
-# for file_path in fasta_files:
-#     test_fasta_file(file_path)
+results = []
+for file_path in fasta_files:
+    results.append(test_fasta_file(file_path))
+
+# 序列少于5的pdbid 记录
+pdb_id_less5 = []
+pdb_id_less5_obj = []
+for result in results:
+    if result.sequence_num < 5:
+        pdb_id_less5.append(result.file.stem)
+        pdb_id_less5_obj.append(result)
+print(pdb_id_less5)
+
+# 展示所有的fasta的描述信息和长度
+keep_have_peptide_results = []
+peptide_length = []
+for result in results:
+    for s in result.sequences:
+        if s.sequence_length < 36:
+            print(s.header_info.description, s.sequence_length)
+            peptide_length.append(s.sequence_length)
+            keep_have_peptide_results.append(result)
+
+assert len(keep_have_peptide_results) == len(peptide_length)
+
+# 检查一下有peptides的结果中是否存在序列数目少于5的结果
+keep_have_peptide_results_less5 = []
+keep_have_peptide_results_less5_obj = []
+keep_have_peptide_results_mean5_obj = []
+keep_have_peptide_results_more5_obj = []
+keep_have_peptide_results_5_obj = []
+for result in keep_have_peptide_results:
+    if result.sequence_num < 5:
+        keep_have_peptide_results_less5.append(result.file.stem)
+        keep_have_peptide_results_less5_obj.append(result)
+    elif result.sequence_num == 5:
+        keep_have_peptide_results_mean5_obj.append(result)
+    else:
+        keep_have_peptide_results_more5_obj.append(result)
+print(f'有peptides的结果中是否存在序列数目少于5的结果: {len(keep_have_peptide_results_less5)} \n {keep_have_peptide_results_less5}')
+print(f'有peptides的结果中是否存在序列数目等于5的结果: {len(keep_have_peptide_results_mean5_obj)} \n {[i.file.stem for i in keep_have_peptide_results_mean5_obj]}')
+print(f'有peptides的结果中是否存在序列数目大于5的结果: {len(keep_have_peptide_results_more5_obj)} \n {[i.file.stem for i in keep_have_peptide_results_more5_obj]}')
+
+# 绘图
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# 创建 DataFrame
+data = {
+    'PDB ID': [],
+    'Sequence Count': [],
+    'Category': []
+}
+
+# 填充 DataFrame
+for result in keep_have_peptide_results_less5_obj:
+    data['PDB ID'].append(result.file.stem)
+    data['Sequence Count'].append(result.sequence_num)
+    data['Category'].append('Less than 5')
+
+for result in keep_have_peptide_results_mean5_obj:
+    data['PDB ID'].append(result.file.stem)
+    data['Sequence Count'].append(result.sequence_num)
+    data['Category'].append('Equal to 5')
+
+for result in keep_have_peptide_results_more5_obj:
+    data['PDB ID'].append(result.file.stem)
+    data['Sequence Count'].append(result.sequence_num)
+    data['Category'].append('More than 5')
+
+df = pd.DataFrame(data)
+
+# 绘制图表
+plt.figure(figsize=(10, 6))
+sns.countplot(x='Category', data=df)
+plt.title('Distribution of Sequence Counts in Peptide Results')
+plt.xlabel('Category')
+plt.ylabel('Count')
+plt.grid(True)
+
+# 保存图表为 SVG 文件
+plt.savefig('/mnt/mydrive/analysis_pdb-dev/Peptide_Sequence_Count_Distribution.svg', format='svg')
+
+# 统计当前数据集中peptides长度分布范围：
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+
+# 统计 peptide_length 的频数
+peptide_length_count = {length: peptide_length.count(length) for length in set(peptide_length)}
+
+# 使用 numpy.array 或 pandas.Series 创建数据
+lengths = np.array(list(peptide_length_count.keys()))
+frequencies = np.array(list(peptide_length_count.values()))
+
+# 绘制图表
+plt.figure(figsize=(10, 6))
+sns.barplot(x=lengths, y=frequencies)
+plt.title('Frequency of Peptide Lengths')
+plt.xlabel('Peptide Length')
+plt.ylabel('Frequency')
+plt.grid(True)
+
+# 保存图表为 SVG 文件
+plt.savefig('/mnt/mydrive/analysis_pdb-dev/peptide_lengths.svg', format='svg')
+
+# 关闭图表以释放内存
+plt.close()
+
+# 继续提取分析 有peptides的结果中是否存在序列数目等于5的结果中有多聚体的情况
+single = []
+multimer  =  []
+unknown = []
+
+for result in keep_have_peptide_results_mean5_obj:
+    # 非多聚体
+    if all(i.header_info.is_polymeric == 'No' for i in result.sequences):
+        single.append(result)
+    # 多聚体
+    elif all(i.header_info.is_polymeric == 'Yes' for i in result.sequences):
+        multimer.append(result)
+    # 未知或其他情况
+    else:
+        unknown.append(result)
+
+# 打印结果
+print(f"单聚体结果数量: {len(single)}")
+print(f"多聚体结果数量: {len(multimer)}")
+print(f"未知或其他情况结果数量: {len(unknown)}")
+
+single_pdbid = [i.file.stem for i in single]
+multimer_pdbid = [i.file.stem for i in multimer]
+print('单聚体结果：',','.join(sorted(single_pdbid)))
+print(100*'-')
+print('多聚体结果：',','.join(sorted(multimer_pdbid)))
+
+# 构建单(多)聚体列表
+def update_file_extension_and_check_existence(results, new_extension):
+    updated_files = []
+    for result in results:
+        # 更新文件扩展名
+        new_file = result.file.with_name(result.file.stem + new_extension)
+        # 检查文件是否存在
+        if new_file.exists():
+            updated_files.append(new_file)
+    return updated_files
+
+# 更新 single 和 multimer 列表中的文件扩展名并检查是否存在
+single_updated = update_file_extension_and_check_existence(single, '.modellerfix.pdb')
+multimer_updated = update_file_extension_and_check_existence(multimer, '.modellerfix.pdb')
+
+print("单聚体更新文件列表：")
+print([file.as_posix() for file in single_updated])
+print("多聚体更新文件列表：")
+print([file.as_posix() for file in multimer_updated])
+
+# 复制分类文件到文件夹
+import shutil
+
+def copy_files_to_directory(files, directory_name):
+    # 创建目标文件夹
+    target_dir = Path(directory_name)
+    target_dir.mkdir(exist_ok=True)
+
+    # 复制文件到目标文件夹
+    for file in files:
+        shutil.copy(file, target_dir / file.name)
+
+# 复制单聚体文件
+copy_files_to_directory(single_updated, 'single_polymeric_files')
+
+# 复制多聚体文件
+copy_files_to_directory(multimer_updated, 'multimer_polymeric_files')
+
+print("文件复制完成。")
+# 打印未知或其他情况的具体原因
+for result in unknown:
+    print(f"PDB ID: {result.file.stem}")
+    for seq in result.sequences:
+        print(f"  Chain ID: {seq.header_info.chain_ids}, Is Polymeric: {seq.header_info.is_polymeric}")
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# 数据准备
+categories = ['Single', 'Multimer', 'Unknown']
+counts = [88, 58, 0]
+
+# 创建 seaborn 条形图
+sns.set(style="whitegrid")
+plt.figure(figsize=(8, 4))
+ax = sns.barplot(x=categories, y=counts, palette="Blues_d")
+
+# 添加标题和标签
+plt.title('Distribution of Polymeric States in FASTA Files')
+plt.xlabel('Polymeric State')
+plt.ylabel('Count')
+
+# 添加数值标签
+for p in ax.patches:
+    ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
+                ha='center', va='center', fontsize=10, color='gray', xytext=(0, 5),
+                textcoords='offset points')
+
+# 保存图表为 SVG 文件
+plt.savefig('/mnt/mydrive/analysis_pdb-dev/polymeric_states_distribution.svg', format='svg')
+