from tcr_pmhc_complexes import FastaFile
from pathlib import Path
import glob
from analysis_pdb import PDBAnalyzer


def download_fasta_for_pdb(pdb_id, pdb_directory):
    """Download the FASTA file for a given PDB ID into the specified directory.

    Args:
        pdb_id: Four-character PDB identifier.
        pdb_directory: Directory that holds (or will hold) ``<pdb_id>.pdb``.

    Returns:
        Whatever ``PDBAnalyzer.download_fasta`` returns for this id
        (presumably the downloaded file's path — confirm against analysis_pdb).
    """
    pdb_analyzer = PDBAnalyzer(pdb_file=Path(pdb_directory) / f"{pdb_id}.pdb")
    fasta_file = pdb_analyzer.download_fasta(pdb_id)
    return fasta_file


def test_fasta_file(file_path):
    """Parse a FASTA file, print a per-sequence summary, and return the parsed object.

    Args:
        file_path: Path (or path-like) to the FASTA file.

    Returns:
        The ``FastaFile`` instance built from ``file_path``.
    """
    fasta_file = FastaFile(file=Path(file_path))
    print(f"File: {file_path}")
    print(f"Number of Sequences: {fasta_file.sequence_num}\n")
    for seq in fasta_file.sequences:
        header_info = seq.header_info
        print(f"PDB ID: {header_info.pdb_id if header_info.pdb_id else 'N/A'}")
        print(f"Chain IDs: {', '.join(header_info.chain_ids) if header_info.chain_ids else 'N/A'}")
        print(f"Author Chain IDs: {', '.join([f'{cid} ({aid})' for cid, aid in header_info.auth_chain_ids.items()]) if header_info.auth_chain_ids else 'N/A'}")
        print(f"Is Polymeric: {header_info.is_polymeric if header_info.is_polymeric else 'Unknown'}")
        print(f"Description: {header_info.description}")
        # Only the first 30 residues are shown to keep the console output readable.
        print(f"Sequence: {seq.sequence.sequence[:30]}...")
        print(f"Sequence Length: {len(seq.sequence.sequence)}\n")
    return fasta_file


# Discover all runner_* directories
runner_directories = glob.glob('/mnt/mydrive/analysis_pdb-dev/pdb_test1/runner_*')

fasta_files = []
for directory in runner_directories:
    # Directories are named "runner_<pdbid>"; the 4-character PDB id is the
    # suffix of the directory NAME (the original sliced the full path string,
    # which only worked because the id is also the path's last 4 characters).
    pdb_id = Path(directory).name[-4:]
    pdb_file = Path(directory) / f'{pdb_id}.pdb'
    fasta_file = Path(directory) / f'{pdb_id}.fasta'
    if pdb_file.exists() and fasta_file.exists():
        fasta_files.append(fasta_file)
    else:
        print(f'File {fasta_file} does not exist, download...')
        ins = PDBAnalyzer(pdb_file)
        ins.download_fasta()
        # BUG FIX: the original never added the freshly downloaded FASTA, so
        # downloaded entries were silently excluded from every later analysis.
        # NOTE(review): assumes download_fasta() writes <pdbid>.fasta next to
        # the PDB file — confirm against analysis_pdb; the exists() guard keeps
        # this safe either way.
        if fasta_file.exists():
            fasta_files.append(fasta_file)

# Check if the number of directories matches the number of PDB files
if len(runner_directories) != len(fasta_files):
    print("Warning: Number of runner directories does not match number of PDB files.")

# Add the standard FASTA file to the list of files to be tested
fasta_files.append(Path('/mnt/mydrive/analysis_pdb-dev/test.fasta'))

# Test each FASTA file and keep the parsed objects for the analyses below.
results = []
for file_path in fasta_files:
    results.append(test_fasta_file(file_path))

# Record PDB ids whose FASTA holds fewer than 5 sequences.
pdb_id_less5 = []
pdb_id_less5_obj = []
for result in results:
    if result.sequence_num < 5:
        pdb_id_less5.append(result.file.stem)
        pdb_id_less5_obj.append(result)
print(pdb_id_less5)

# Show the description and length of every short chain (< 36 residues); such
# chains are treated as peptides here.
keep_have_peptide_results = []
peptide_length = []
for result in results:
    has_peptide = False
    for s in result.sequences:
        if s.sequence_length < 36:
            print(s.header_info.description, s.sequence_length)
            peptide_length.append(s.sequence_length)
            has_peptide = True
    # BUG FIX: the original appended `result` once PER short chain, so a file
    # with two or more peptides appeared multiple times in
    # keep_have_peptide_results and was double-counted downstream.  Append
    # each result at most once.  (The original's
    # `assert len(keep_have_peptide_results) == len(peptide_length)` is
    # dropped: with de-duplication the two lists legitimately differ in
    # length, and `assert` is the wrong tool for data validation anyway.)
    if has_peptide:
        keep_have_peptide_results.append(result)

# Among peptide-bearing results, split by sequence count (< 5, == 5, > 5).
keep_have_peptide_results_less5 = []
keep_have_peptide_results_less5_obj = []
keep_have_peptide_results_mean5_obj = []
keep_have_peptide_results_more5_obj = []
keep_have_peptide_results_5_obj = []  # kept for compatibility; unused here
for result in keep_have_peptide_results:
    if result.sequence_num < 5:
        keep_have_peptide_results_less5.append(result.file.stem)
        keep_have_peptide_results_less5_obj.append(result)
    elif result.sequence_num == 5:
        keep_have_peptide_results_mean5_obj.append(result)
    else:
        keep_have_peptide_results_more5_obj.append(result)

print(f'有peptides的结果中是否存在序列数目少于5的结果: {len(keep_have_peptide_results_less5)} \n {keep_have_peptide_results_less5}')
print(f'有peptides的结果中是否存在序列数目等于5的结果: {len(keep_have_peptide_results_mean5_obj)} \n {[i.file.stem for i in keep_have_peptide_results_mean5_obj]}')
print(f'有peptides的结果中是否存在序列数目大于5的结果: {len(keep_have_peptide_results_more5_obj)} \n {[i.file.stem for i in keep_have_peptide_results_more5_obj]}')

# Plotting
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Build a tidy DataFrame with one row per peptide-bearing result.
data = {
    'PDB ID': [],
    'Sequence Count': [],
    'Category': []
}
for category, bucket in (
    ('Less than 5', keep_have_peptide_results_less5_obj),
    ('Equal to 5', keep_have_peptide_results_mean5_obj),
    ('More than 5', keep_have_peptide_results_more5_obj),
):
    for result in bucket:
        data['PDB ID'].append(result.file.stem)
        data['Sequence Count'].append(result.sequence_num)
        data['Category'].append(category)

df = pd.DataFrame(data)

plt.figure(figsize=(10, 6))
sns.countplot(x='Category', data=df)
plt.title('Distribution of Sequence Counts in Peptide Results')
plt.xlabel('Category')
plt.ylabel('Count')
plt.grid(True)
plt.savefig('/mnt/mydrive/analysis_pdb-dev/Peptide_Sequence_Count_Distribution.svg', format='svg')
# FIX: the original never closed this figure, leaking it for the script's
# remaining lifetime.
plt.close()

# Distribution of peptide lengths in the current data set.
from collections import Counter
import numpy as np

# PERF FIX: the original built the frequency table with
# {length: peptide_length.count(length) for length in set(peptide_length)},
# which is O(n^2); Counter does one O(n) pass.  Lengths are sorted so the
# bar chart's x axis is ordered.
peptide_length_count = Counter(peptide_length)
lengths = np.array(sorted(peptide_length_count))
frequencies = np.array([peptide_length_count[length] for length in lengths])

plt.figure(figsize=(10, 6))
sns.barplot(x=lengths, y=frequencies)
plt.title('Frequency of Peptide Lengths')
plt.xlabel('Peptide Length')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/mnt/mydrive/analysis_pdb-dev/peptide_lengths.svg', format='svg')
plt.close()  # release the figure

# Among the peptide-bearing results with exactly 5 sequences, check whether
# any are multimeric (per-chain `is_polymeric` header flag).
single = []
multimer = []
unknown = []
for result in keep_have_peptide_results_mean5_obj:
    # Collapse the per-chain flags; a result is single/multimeric only when
    # ALL chains agree, everything else is "unknown".
    states = {s.header_info.is_polymeric for s in result.sequences}
    if states == {'No'}:
        single.append(result)
    elif states == {'Yes'}:
        multimer.append(result)
    else:
        unknown.append(result)

print(f"单聚体结果数量: {len(single)}")
print(f"多聚体结果数量: {len(multimer)}")
print(f"未知或其他情况结果数量: {len(unknown)}")

single_pdbid = [i.file.stem for i in single]
multimer_pdbid = [i.file.stem for i in multimer]
print('单聚体结果:', ','.join(sorted(single_pdbid)))
print(100 * '-')
print('多聚体结果:', ','.join(sorted(multimer_pdbid)))


# Build the single-/multimer file lists.
def update_file_extension_and_check_existence(results, new_extension):
    """Map each result's file to ``<stem><new_extension>`` and keep existing paths.

    Args:
        results: Iterable of parsed FASTA results exposing a ``file`` Path.
        new_extension: Suffix (e.g. ``'.modellerfix.pdb'``) appended to the stem.

    Returns:
        List of ``Path`` objects that exist on disk.
    """
    updated_files = []
    for result in results:
        new_file = result.file.with_name(result.file.stem + new_extension)
        if new_file.exists():
            updated_files.append(new_file)
    return updated_files


single_updated = update_file_extension_and_check_existence(single, '.modellerfix.pdb')
multimer_updated = update_file_extension_and_check_existence(multimer, '.modellerfix.pdb')

print("单聚体更新文件列表:")
print([file.as_posix() for file in single_updated])
print("多聚体更新文件列表:")
print([file.as_posix() for file in multimer_updated])

# Copy the classified files into category folders.
import shutil


def copy_files_to_directory(files, directory_name):
    """Copy ``files`` into ``directory_name``, creating the directory if needed."""
    target_dir = Path(directory_name)
    target_dir.mkdir(exist_ok=True)
    for file in files:
        shutil.copy(file, target_dir / file.name)


copy_files_to_directory(single_updated, 'single_polymeric_files')
copy_files_to_directory(multimer_updated, 'multimer_polymeric_files')
print("文件复制完成。")

# Print the specific reason for each unknown/other-case result.
for result in unknown:
    print(f"PDB ID: {result.file.stem}")
    for seq in result.sequences:
        print(f" Chain ID: {seq.header_info.chain_ids}, Is Polymeric: {seq.header_info.is_polymeric}")

import seaborn as sns
import matplotlib.pyplot as plt

categories = ['Single', 'Multimer', 'Unknown']
# BUG FIX: the original hard-coded counts = [88, 58, 0], which silently goes
# stale whenever the input data changes; derive the counts from the data.
counts = [len(single), len(multimer), len(unknown)]

sns.set(style="whitegrid")
plt.figure(figsize=(8, 4))
ax = sns.barplot(x=categories, y=counts, palette="Blues_d")

plt.title('Distribution of Polymeric States in FASTA Files')
plt.xlabel('Polymeric State')
plt.ylabel('Count')

# Annotate each bar with its count.
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=10, color='gray',
                xytext=(0, 5), textcoords='offset points')

# Save the chart as an SVG file.
# Persist the final polymeric-state bar chart to disk as SVG.
plt.savefig('/mnt/mydrive/analysis_pdb-dev/polymeric_states_distribution.svg', format='svg')