diff --git a/test.py b/test.py index a088c36..84be3e3 100644 --- a/test.py +++ b/test.py @@ -48,8 +48,215 @@ if len(runner_directories) != len(fasta_files): # fasta_files.append(Path('/mnt/mydrive/analysis_pdb-dev/test.fasta')) # # Test each FASTA file -# for file_path in fasta_files: -# test_fasta_file(file_path) +results = [] +for file_path in fasta_files: + results.append(test_fasta_file(file_path)) + +# 序列少于5的pdbid 记录 +pdb_id_less5 = [] +pdb_id_less5_obj = [] +for result in results: + if result.sequence_num < 5: + pdb_id_less5.append(result.file.stem) + pdb_id_less5_obj.append(result) +print(pdb_id_less5) + +# 展示所有的fasta的描述信息和长度 +keep_have_peptide_results = [] +peptide_length = [] +for result in results: + for s in result.sequences: + if s.sequence_length < 36: + print(s.header_info.description, s.sequence_length) + peptide_length.append(s.sequence_length) + keep_have_peptide_results.append(result) + +assert len(keep_have_peptide_results) == len(peptide_length) + +# 检查一下有peptides的结果中是否存在序列数目少于5的结果 +keep_have_peptide_results_less5 = [] +keep_have_peptide_results_less5_obj = [] +keep_have_peptide_results_mean5_obj = [] +keep_have_peptide_results_more5_obj = [] +keep_have_peptide_results_5_obj = [] +for result in keep_have_peptide_results: + if result.sequence_num < 5: + keep_have_peptide_results_less5.append(result.file.stem) + keep_have_peptide_results_less5_obj.append(result) + elif result.sequence_num == 5: + keep_have_peptide_results_mean5_obj.append(result) + else: + keep_have_peptide_results_more5_obj.append(result) +print(f'有peptides的结果中是否存在序列数目少于5的结果: {len(keep_have_peptide_results_less5)} \n {keep_have_peptide_results_less5}') +print(f'有peptides的结果中是否存在序列数目等于5的结果: {len(keep_have_peptide_results_mean5_obj)} \n {[i.file.stem for i in keep_have_peptide_results_mean5_obj]}') +print(f'有peptides的结果中是否存在序列数目大于5的结果: {len(keep_have_peptide_results_more5_obj)} \n {[i.file.stem for i in keep_have_peptide_results_more5_obj]}') + +# 绘图 +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +# 创建 DataFrame +data = { + 'PDB ID': [], + 'Sequence Count': [], + 'Category': [] +} + +# 填充 DataFrame +for result in keep_have_peptide_results_less5_obj: + data['PDB ID'].append(result.file.stem) + data['Sequence Count'].append(result.sequence_num) + data['Category'].append('Less than 5') + +for result in keep_have_peptide_results_mean5_obj: + data['PDB ID'].append(result.file.stem) + data['Sequence Count'].append(result.sequence_num) + data['Category'].append('Equal to 5') + +for result in keep_have_peptide_results_more5_obj: + data['PDB ID'].append(result.file.stem) + data['Sequence Count'].append(result.sequence_num) + data['Category'].append('More than 5') + +df = pd.DataFrame(data) + +# 绘制图表 +plt.figure(figsize=(10, 6)) +sns.countplot(x='Category', data=df) +plt.title('Distribution of Sequence Counts in Peptide Results') +plt.xlabel('Category') +plt.ylabel('Count') +plt.grid(True) + +# 保存图表为 SVG 文件 +plt.savefig('/mnt/mydrive/analysis_pdb-dev/Peptide_Sequence_Count_Distribution.svg', format='svg') + +# 统计当前数据集中peptides长度分布范围: +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import numpy as np + +# 统计 peptide_length 的频数 +peptide_length_count = {length: peptide_length.count(length) for length in set(peptide_length)} + +# 使用 numpy.array 或 pandas.Series 创建数据 +lengths = np.array(list(peptide_length_count.keys())) +frequencies = np.array(list(peptide_length_count.values())) + +# 绘制图表 +plt.figure(figsize=(10, 6)) +sns.barplot(x=lengths, y=frequencies) +plt.title('Frequency of Peptide Lengths') +plt.xlabel('Peptide Length') +plt.ylabel('Frequency') +plt.grid(True) + +# 保存图表为 SVG 文件 +plt.savefig('/mnt/mydrive/analysis_pdb-dev/peptide_lengths.svg', format='svg') + +# 关闭图表以释放内存 +plt.close() + +# 继续提取分析 有peptides的结果中是否存在序列数目等于5的结果中有多聚体的情况 +single = [] +multimer = [] +unknown = [] + +for result in keep_have_peptide_results_mean5_obj: + # 非多聚体 + if all(i.header_info.is_polymeric == 'No' for i in result.sequences): + single.append(result) + # 多聚体 + elif all(i.header_info.is_polymeric == 'Yes' for i in result.sequences): + multimer.append(result) + # 未知或其他情况 + else: + unknown.append(result) + +# 打印结果 +print(f"单聚体结果数量: {len(single)}") +print(f"多聚体结果数量: {len(multimer)}") +print(f"未知或其他情况结果数量: {len(unknown)}") + +single_pdbid = [i.file.stem for i in single] +multimer_pdbid = [i.file.stem for i in multimer] +print('单聚体结果:',','.join(sorted(single_pdbid))) +print(100*'-') +print('多聚体结果:',','.join(sorted(multimer_pdbid))) + +# 构建单(多)聚体列表 +def update_file_extension_and_check_existence(results, new_extension): + updated_files = [] + for result in results: + # 更新文件扩展名 + new_file = result.file.with_name(result.file.stem + new_extension) + # 检查文件是否存在 + if new_file.exists(): + updated_files.append(new_file) + return updated_files + +# 更新 single 和 multimer 列表中的文件扩展名并检查是否存在 +single_updated = update_file_extension_and_check_existence(single, '.modellerfix.pdb') +multimer_updated = update_file_extension_and_check_existence(multimer, '.modellerfix.pdb') + +print("单聚体更新文件列表:") +print([file.as_posix() for file in single_updated]) +print("多聚体更新文件列表:") +print([file.as_posix() for file in multimer_updated]) + +# 复制分类文件到文件夹 +import shutil + +def copy_files_to_directory(files, directory_name): + # 创建目标文件夹 + target_dir = Path(directory_name) + target_dir.mkdir(exist_ok=True) + + # 复制文件到目标文件夹 + for file in files: + shutil.copy(file, target_dir / file.name) + +# 复制单聚体文件 +copy_files_to_directory(single_updated, 'single_polymeric_files') + +# 复制多聚体文件 +copy_files_to_directory(multimer_updated, 'multimer_polymeric_files') + +print("文件复制完成。") +# 打印未知或其他情况的具体原因 +for result in unknown: + print(f"PDB ID: {result.file.stem}") + for seq in result.sequences: + print(f" Chain ID: {seq.header_info.chain_ids}, Is Polymeric: {seq.header_info.is_polymeric}") + +import seaborn as sns +import matplotlib.pyplot as plt + +# 数据准备 +categories = ['Single', 'Multimer', 'Unknown'] +counts = [88, 58, 0] + +# 创建 seaborn 条形图 +sns.set(style="whitegrid") +plt.figure(figsize=(8, 4)) +ax = sns.barplot(x=categories, y=counts, palette="Blues_d") + +# 添加标题和标签 +plt.title('Distribution of Polymeric States in FASTA Files') +plt.xlabel('Polymeric State') +plt.ylabel('Count') + +# 添加数值标签 +for p in ax.patches: + ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()), + ha='center', va='center', fontsize=10, color='gray', xytext=(0, 5), + textcoords='offset points') + +# 保存图表为 SVG 文件 +plt.savefig('/mnt/mydrive/analysis_pdb-dev/polymeric_states_distribution.svg', format='svg') +