# analysis_pdb/test.py
# Exploratory analysis script, exported from the repository web UI on
# 2024-02-01 15:30:01 +08:00.
# NOTE(review): the export carried web-page chrome (file size, Blame/History
# links, an "ambiguous Unicode characters" warning banner); that non-code
# residue has been reduced to this comment so the file parses as Python.
from tcr_pmhc_complexes import FastaFile
from pathlib import Path
import glob
from analysis_pdb import PDBAnalyzer
def download_fasta_for_pdb(pdb_id, pdb_directory):
    """Download the FASTA file for *pdb_id* into *pdb_directory*.

    Builds a ``PDBAnalyzer`` around ``<pdb_directory>/<pdb_id>.pdb`` and
    delegates to its ``download_fasta``; returns whatever that call returns
    (presumably the downloaded FASTA path — confirm against the PDBAnalyzer API).
    """
    analyzer = PDBAnalyzer(pdb_file=Path(pdb_directory) / f"{pdb_id}.pdb")
    return analyzer.download_fasta(pdb_id)
def test_fasta_file(file_path):
    """Parse *file_path* as a FastaFile, print a per-sequence summary, return it."""
    fasta_file = FastaFile(file=Path(file_path))
    print(f"File: {file_path}")
    print(f"Number of Sequences: {fasta_file.sequence_num}\n")
    for seq in fasta_file.sequences:
        info = seq.header_info
        # Fall back to placeholder text for any header field that is empty.
        pdb_id = info.pdb_id if info.pdb_id else 'N/A'
        chains = ', '.join(info.chain_ids) if info.chain_ids else 'N/A'
        if info.auth_chain_ids:
            auth = ', '.join(f'{cid} ({aid})' for cid, aid in info.auth_chain_ids.items())
        else:
            auth = 'N/A'
        polymeric = info.is_polymeric if info.is_polymeric else 'Unknown'
        print(f"PDB ID: {pdb_id}")
        print(f"Chain IDs: {chains}")
        print(f"Author Chain IDs: {auth}")
        print(f"Is Polymeric: {polymeric}")
        print(f"Description: {info.description}")
        # Only preview the first 30 residues; full length is reported below.
        print(f"Sequence: {seq.sequence.sequence[:30]}...")
        print(f"Sequence Length: {len(seq.sequence.sequence)}\n")
    return fasta_file
# Discover all runner_* directories and collect the FASTA files that already
# exist; for any directory missing one, trigger a download via PDBAnalyzer.
# NOTE(review): freshly downloaded FASTAs are NOT appended to fasta_files, so
# a rerun is needed to include them — preserved as-is from the original.
runner_directories = glob.glob('/mnt/mydrive/analysis_pdb-dev/pdb_test3/runner_*')
fasta_files = []
for directory in runner_directories:
    pdb_id = directory[-4:]  # directory names end with the 4-char PDB id
    base = Path(directory)
    pdb_file = base / f'{pdb_id}.pdb'
    fasta_file = base / f'{pdb_id}.fasta'
    if pdb_file.exists() and fasta_file.exists():
        fasta_files.append(fasta_file)
    else:
        print(f'File {fasta_file} does not exist, download...')
        PDBAnalyzer(pdb_file).download_fasta()
# Sanity check: every runner directory should have contributed one FASTA file.
if len(runner_directories) != len(fasta_files):
    print("Warning: Number of runner directories does not match number of PDB files.")
# Add the standard FASTA file to the list of files to be tested
# fasta_files.append(Path('/mnt/mydrive/analysis_pdb-dev/test.fasta'))
# Summarize every collected FASTA file.
results = [test_fasta_file(file_path) for file_path in fasta_files]
# Record the PDB ids whose FASTA contains fewer than 5 sequences.
pdb_id_less5_obj = [r for r in results if r.sequence_num < 5]
pdb_id_less5 = [r.file.stem for r in pdb_id_less5_obj]
print(pdb_id_less5)
# Show the description and length of every short sequence (< 36 residues),
# treating those as peptides. A result is recorded once PER peptide it holds,
# so keep_have_peptide_results may contain duplicates by design.
peptide_hits = [
    (result, s)
    for result in results
    for s in result.sequences
    if s.sequence_length < 36
]
keep_have_peptide_results = []
peptide_length = []
for result, s in peptide_hits:
    print(s.header_info.description, s.sequence_length)
    peptide_length.append(s.sequence_length)
    keep_have_peptide_results.append(result)
# One parent entry was appended per recorded peptide length.
assert len(keep_have_peptide_results) == len(peptide_length)
# Among the peptide-bearing results, check whether any have fewer than,
# exactly, or more than 5 sequences.
keep_have_peptide_results_less5 = []
keep_have_peptide_results_less5_obj = []
keep_have_peptide_results_mean5_obj = []
keep_have_peptide_results_more5_obj = []
keep_have_peptide_results_5_obj = []  # NOTE(review): never populated in this chunk
for result in keep_have_peptide_results:
    count = result.sequence_num
    if count < 5:
        keep_have_peptide_results_less5.append(result.file.stem)
        keep_have_peptide_results_less5_obj.append(result)
    elif count == 5:
        keep_have_peptide_results_mean5_obj.append(result)
    else:
        keep_have_peptide_results_more5_obj.append(result)
print(f'有peptides的结果中是否存在序列数目少于5的结果: {len(keep_have_peptide_results_less5)} \n {keep_have_peptide_results_less5}')
print(f'有peptides的结果中是否存在序列数目等于5的结果: {len(keep_have_peptide_results_mean5_obj)} \n {[i.file.stem for i in keep_have_peptide_results_mean5_obj]}')
print(f'有peptides的结果中是否存在序列数目大于5的结果: {len(keep_have_peptide_results_more5_obj)} \n {[i.file.stem for i in keep_have_peptide_results_more5_obj]}')
# Plot the distribution of sequence-count categories among peptide results.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Build a long-form DataFrame: one row per result, tagged with its category.
# (The original filled the dict with three near-identical loops; one loop over
# (category, bucket) pairs keeps the same insertion order with less code.)
data = {
    'PDB ID': [],
    'Sequence Count': [],
    'Category': []
}
for category, bucket in (
    ('Less than 5', keep_have_peptide_results_less5_obj),
    ('Equal to 5', keep_have_peptide_results_mean5_obj),
    ('More than 5', keep_have_peptide_results_more5_obj),
):
    for result in bucket:
        data['PDB ID'].append(result.file.stem)
        data['Sequence Count'].append(result.sequence_num)
        data['Category'].append(category)
df = pd.DataFrame(data)

# Draw and save the chart.
plt.figure(figsize=(10, 6))
sns.countplot(x='Category', data=df)
plt.title('Distribution of Sequence Counts in Peptide Results')
plt.xlabel('Category')
plt.ylabel('Count')
plt.grid(True)
plt.savefig('/mnt/mydrive/analysis_pdb-dev/Peptide_Sequence_Count_Distribution.svg', format='svg')
# FIX: close the figure to release memory — the original leaked this figure
# while every later chart in this script closes its own.
plt.close()
# Plot the length distribution of the peptides in the current data set.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# FIX: Counter counts all lengths in one O(n) pass; the original dict
# comprehension called ``peptide_length.count(length)`` per distinct length,
# rescanning the whole list each time (O(n * k)).
peptide_length_count = Counter(peptide_length)
# FIX: sort the lengths so the x-axis is deterministic and monotonic; the
# original iterated an unordered ``set()``.
lengths = np.array(sorted(peptide_length_count))
frequencies = np.array([peptide_length_count[length] for length in lengths])

# Draw and save the chart.
plt.figure(figsize=(10, 6))
sns.barplot(x=lengths, y=frequencies)
plt.title('Frequency of Peptide Lengths')
plt.xlabel('Peptide Length')
plt.ylabel('Frequency')
plt.grid(True)
plt.savefig('/mnt/mydrive/analysis_pdb-dev/peptide_lengths.svg', format='svg')
# Close the figure to release memory.
plt.close()
# Within the ==5 bucket, check for multimeric entries: classify each result by
# the polymeric state reported on its sequences.
# NOTE(review): is_polymeric is compared against the strings 'Yes'/'No' here,
# while earlier code treats it as truthy — confirm the field's actual values.
single = []    # every chain reports 'No'
multimer = []  # every chain reports 'Yes'
unknown = []   # mixed or unrecognised values
for result in keep_have_peptide_results_mean5_obj:
    seqs = result.sequences
    if all(s.header_info.is_polymeric == 'No' for s in seqs):
        single.append(result)
    elif all(s.header_info.is_polymeric == 'Yes' for s in seqs):
        multimer.append(result)
    else:
        unknown.append(result)

print(f"单聚体结果数量: {len(single)}")
print(f"多聚体结果数量: {len(multimer)}")
print(f"未知或其他情况结果数量: {len(unknown)}")

# Keep the id lists in bucket order (later download loops iterate them);
# only the printed views are sorted.
single_pdbid = [r.file.stem for r in single]
multimer_pdbid = [r.file.stem for r in multimer]
print('单聚体结果:', ','.join(sorted(single_pdbid)))
print('-' * 100)
print('多聚体结果:', ','.join(sorted(multimer_pdbid)))
# Build the single-/multimer file lists.
def update_file_extension_and_check_existence(results, new_extension):
    """Map each result's file to a sibling named ``<stem><new_extension>``.

    Only candidate paths that actually exist on disk are returned,
    in the input order.
    """
    candidates = (
        result.file.with_name(result.file.stem + new_extension)
        for result in results
    )
    return [path for path in candidates if path.exists()]
# Re-point the single and multimer results at their .modellerfix.pdb siblings,
# keeping only files that are actually present on disk.
single_updated = update_file_extension_and_check_existence(single, '.modellerfix.pdb')
multimer_updated = update_file_extension_and_check_existence(multimer, '.modellerfix.pdb')
print("单聚体更新文件列表:")
print([path.as_posix() for path in single_updated])
print("多聚体更新文件列表:")
print([path.as_posix() for path in multimer_updated])
# Copy the classified files into per-category folders.
import shutil


def copy_files_to_directory(files, directory_name):
    """Copy every path in *files* into *directory_name*, creating it if needed."""
    target_dir = Path(directory_name)
    target_dir.mkdir(exist_ok=True)
    for src in files:
        shutil.copy(src, target_dir / src.name)
# Materialise the classification on disk: one folder per polymeric category.
copy_files_to_directory(single_updated, 'single_polymeric_files')
copy_files_to_directory(multimer_updated, 'multimer_polymeric_files')
print("文件复制完成。")
# Explain each 'unknown' classification: dump every chain's polymeric flag.
for res in unknown:
    print(f"PDB ID: {res.file.stem}")
    for seq in res.sequences:
        print(f" Chain ID: {seq.header_info.chain_ids}, Is Polymeric: {seq.header_info.is_polymeric}")
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the single/multimer/unknown split.
categories = ['Single', 'Multimer', 'Unknown']
counts = [len(single), len(multimer), len(unknown)]

sns.set(style="whitegrid")
plt.figure(figsize=(8, 4))
ax = sns.barplot(x=categories, y=counts, palette="Blues_d")
plt.title('Distribution of Polymeric States in FASTA Files')
plt.xlabel('Polymeric State')
plt.ylabel('Count')
# Annotate each bar with its integer count, nudged 5pt above the bar top.
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=10, color='gray', xytext=(0, 5),
                textcoords='offset points')
plt.savefig('/mnt/mydrive/analysis_pdb-dev/polymeric_states_distribution.svg', format='svg')
# FIX: close the figure after saving — the original leaked it, unlike the
# other plots in this script.
plt.close()
from analysis_pdb import PDBAnalyzer  # NOTE(review): re-import, already imported above
import requests

# Download the FASTA for every multimer entry from RCSB.
# BUG FIX: the original checked ``output_file.exists()`` BEFORE ``output_file``
# was assigned — a NameError on the first iteration, and a stale path from the
# previous iteration on later ones. The path is now built first, and the HTTP
# request is only issued when the file is actually missing.
multi_dir = Path('multi')
multi_dir.mkdir(exist_ok=True)  # ensure the target folder exists before writing
for pdb_id in multimer_pdbid:
    output_file = multi_dir / f"{pdb_id}.fasta"
    if output_file.exists():
        continue  # already downloaded
    response = requests.get(f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}")
    if response.status_code == 200:
        with open(output_file, 'w') as fh:
            fh.write(response.text)
        # Belt-and-braces: the write above should have created the file.
        if not output_file.exists():
            raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")
    else:
        raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")
# Stage the corresponding fixed PDB structures for the multimer set.
# NOTE(review): assumes 'fixed/' and 'multi_pdb/' already exist — confirm.
for pdb_id in multimer_pdbid:
    shutil.copy(Path('fixed') / f'{pdb_id}.pdb', Path('multi_pdb') / f'{pdb_id}.pdb')
# Download the FASTA for every single-polymer entry from RCSB.
# FIX: skip the HTTP request entirely when the file already exists — the
# original issued the GET first and only then checked the file, wasting one
# network round-trip per cached entry. Also ensure the target folder exists.
single_dir = Path('single')
single_dir.mkdir(exist_ok=True)
for pdb_id in single_pdbid:
    output_file = single_dir / f"{pdb_id}.fasta"
    if output_file.exists():
        continue  # already downloaded
    response = requests.get(f"https://www.rcsb.org/fasta/entry/{pdb_id.upper()}")
    if response.status_code == 200:
        with open(output_file, 'w') as fh:
            fh.write(response.text)
        # Belt-and-braces: the write above should have created the file.
        if not output_file.exists():
            raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")
    else:
        raise Exception(f"Failed to download FASTA file for PDB ID {pdb_id}")
# Stage the corresponding fixed PDB structures for the single-polymer set.
# NOTE(review): assumes 'fixed/' and 'single_pdb/' already exist — confirm.
for pdb_id in single_pdbid:
    shutil.copy(Path('fixed') / f'{pdb_id}.pdb', Path('single_pdb') / f'{pdb_id}.pdb')
# Fetch structure 4ozi as a .pdb via PyMOL's command API — presumably a quick
# sanity check of the environment; the file lands in the working directory
# (or PyMOL's fetch_path, if configured — confirm).
from pymol import cmd
cmd.fetch('4ozi', type='pdb')