# Analysis script: survey FASTA/PDB files of TCR-pMHC complexes, classify
# entries by sequence count and polymeric state, plot distributions, and
# stage classified files for downstream modelling.
import glob
from collections import Counter
from pathlib import Path

from analysis_pdb import PDBAnalyzer
from tcr_pmhc_complexes import FastaFile

def download_fasta_for_pdb(pdb_id, pdb_directory):
    """Download the FASTA file for *pdb_id*.

    Assumes ``<pdb_directory>/<pdb_id>.pdb`` exists; returns whatever
    ``PDBAnalyzer.download_fasta`` returns (presumably the FASTA path —
    TODO confirm against analysis_pdb).
    """
    pdb_path = Path(pdb_directory) / f"{pdb_id}.pdb"
    analyzer = PDBAnalyzer(pdb_file=pdb_path)
    return analyzer.download_fasta(pdb_id)
|
||
|
||
def test_fasta_file(file_path):
    """Parse *file_path* as a FastaFile, print a per-sequence summary, and
    return the parsed object."""
    parsed = FastaFile(file=Path(file_path))

    print(f"File: {file_path}")
    print(f"Number of Sequences: {parsed.sequence_num}\n")

    for record in parsed.sequences:
        info = record.header_info
        # Pre-compute display strings; fall back to 'N/A'/'Unknown' on empty fields.
        pdb_id = info.pdb_id if info.pdb_id else 'N/A'
        chains = ', '.join(info.chain_ids) if info.chain_ids else 'N/A'
        auth_chains = (', '.join(f'{cid} ({aid})' for cid, aid in info.auth_chain_ids.items())
                       if info.auth_chain_ids else 'N/A')
        polymeric = info.is_polymeric if info.is_polymeric else 'Unknown'
        residues = record.sequence.sequence

        print(f"PDB ID: {pdb_id}")
        print(f"Chain IDs: {chains}")
        print(f"Author Chain IDs: {auth_chains}")
        print(f"Is Polymeric: {polymeric}")
        print(f"Description: {info.description}")
        print(f"Sequence: {residues[:30]}...")
        print(f"Sequence Length: {len(residues)}\n")

    return parsed
|
||
|
||
# Discover every runner_* working directory; collect FASTA files that already
# sit next to their PDB file, and trigger a download for the missing ones.
runner_directories = glob.glob('/mnt/mydrive/analysis_pdb-dev/pdb_test3/runner_*')
fasta_files = []

for directory in runner_directories:
    run_dir = Path(directory)
    entry_id = directory[-4:]  # runner_XXXX -> last four characters are the PDB id
    pdb_file = run_dir / f'{entry_id}.pdb'
    fasta_file = run_dir / f'{entry_id}.fasta'
    if pdb_file.exists() and fasta_file.exists():
        fasta_files.append(fasta_file)
        continue
    print(f'File {fasta_file} does not exist, download...')
    PDBAnalyzer(pdb_file).download_fasta()

# A mismatch means at least one directory was missing a PDB or FASTA file.
if len(runner_directories) != len(fasta_files):
    print("Warning: Number of runner directories does not match number of PDB files.")
|
||
|
||
# Add the standard FASTA file to the list of files to be tested
# fasta_files.append(Path('/mnt/mydrive/analysis_pdb-dev/test.fasta'))

# Parse and summarize each discovered FASTA file, keeping the parsed objects.
results = [test_fasta_file(file_path) for file_path in fasta_files]
|
||
|
||
# Record the PDB ids whose FASTA contains fewer than five sequences.
pdb_id_less5_obj = [result for result in results if result.sequence_num < 5]
pdb_id_less5 = [result.file.stem for result in pdb_id_less5_obj]
print(pdb_id_less5)
|
||
|
||
# Show the description and length of every short sequence (< 36 residues,
# i.e. candidate peptides) across all parsed FASTA files.
keep_have_peptide_results = []
peptide_length = []
for result in results:
    for record in result.sequences:
        if record.sequence_length < 36:
            print(record.header_info.description, record.sequence_length)
            peptide_length.append(record.sequence_length)
            # NOTE: the same result is appended once per short sequence it
            # contains, keeping both lists index-aligned.
            keep_have_peptide_results.append(result)

assert len(keep_have_peptide_results) == len(peptide_length)
|
||
|
||
# Check whether the peptide-bearing results include entries with fewer than /
# exactly / more than five sequences.
keep_have_peptide_results_less5 = []
keep_have_peptide_results_less5_obj = []
keep_have_peptide_results_mean5_obj = []
keep_have_peptide_results_more5_obj = []
keep_have_peptide_results_5_obj = []  # defined in the original; never populated

for entry in keep_have_peptide_results:
    seq_count = entry.sequence_num
    if seq_count < 5:
        keep_have_peptide_results_less5.append(entry.file.stem)
        keep_have_peptide_results_less5_obj.append(entry)
    elif seq_count == 5:
        keep_have_peptide_results_mean5_obj.append(entry)
    else:
        keep_have_peptide_results_more5_obj.append(entry)

print(f'有peptides的结果中是否存在序列数目少于5的结果: {len(keep_have_peptide_results_less5)} \n {keep_have_peptide_results_less5}')
print(f'有peptides的结果中是否存在序列数目等于5的结果: {len(keep_have_peptide_results_mean5_obj)} \n {[i.file.stem for i in keep_have_peptide_results_mean5_obj]}')
print(f'有peptides的结果中是否存在序列数目大于5的结果: {len(keep_have_peptide_results_more5_obj)} \n {[i.file.stem for i in keep_have_peptide_results_more5_obj]}')
|
||
|
||
# Plot the distribution of sequence counts among peptide-bearing results.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assemble a long-format table: one row per result, tagged by its category.
data = {
    'PDB ID': [],
    'Sequence Count': [],
    'Category': []
}

category_buckets = (
    ('Less than 5', keep_have_peptide_results_less5_obj),
    ('Equal to 5', keep_have_peptide_results_mean5_obj),
    ('More than 5', keep_have_peptide_results_more5_obj),
)
for label, bucket in category_buckets:
    for result in bucket:
        data['PDB ID'].append(result.file.stem)
        data['Sequence Count'].append(result.sequence_num)
        data['Category'].append(label)

df = pd.DataFrame(data)

# Draw the per-category counts.
plt.figure(figsize=(10, 6))
sns.countplot(x='Category', data=df)
plt.title('Distribution of Sequence Counts in Peptide Results')
plt.xlabel('Category')
plt.ylabel('Count')
plt.grid(True)

# Save the chart as an SVG file.
plt.savefig('/mnt/mydrive/analysis_pdb-dev/Peptide_Sequence_Count_Distribution.svg', format='svg')
|
||
|
||
# Plot the length distribution of the peptides in the current data set.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Tally peptide-length frequencies. Counter is a single O(n) pass, unlike the
# original {length: peptide_length.count(length)} comprehension, which called
# list.count once per distinct length (O(n^2)).
peptide_length_count = Counter(peptide_length)

# Sort by length so the x-axis order is deterministic and monotonic
# (set/dict iteration order was arbitrary before).
sorted_counts = sorted(peptide_length_count.items())
lengths = np.array([length for length, _ in sorted_counts])
frequencies = np.array([freq for _, freq in sorted_counts])

# Draw the frequency bar chart.
plt.figure(figsize=(10, 6))
sns.barplot(x=lengths, y=frequencies)
plt.title('Frequency of Peptide Lengths')
plt.xlabel('Peptide Length')
plt.ylabel('Frequency')
plt.grid(True)

# Save the chart as an SVG file.
plt.savefig('/mnt/mydrive/analysis_pdb-dev/peptide_lengths.svg', format='svg')

# Close the figure to release memory.
plt.close()
|
||
|
||
# Among the peptide-bearing results with exactly five sequences, separate
# multimeric entries from single-copy ones using each chain's polymeric flag.
single = []
multimer = []
unknown = []

for result in keep_have_peptide_results_mean5_obj:
    flags = [record.header_info.is_polymeric for record in result.sequences]
    if all(flag == 'No' for flag in flags):
        # No chain is polymeric -> treat as a monomeric entry.
        single.append(result)
    elif all(flag == 'Yes' for flag in flags):
        # Every chain is polymeric -> treat as a multimeric entry.
        multimer.append(result)
    else:
        # Mixed or missing flags.
        unknown.append(result)

# Report the split.
print(f"单聚体结果数量: {len(single)}")
print(f"多聚体结果数量: {len(multimer)}")
print(f"未知或其他情况结果数量: {len(unknown)}")

single_pdbid = [entry.file.stem for entry in single]
multimer_pdbid = [entry.file.stem for entry in multimer]
print('单聚体结果:', ','.join(sorted(single_pdbid)))
print(100 * '-')
print('多聚体结果:', ','.join(sorted(multimer_pdbid)))
|
||
|
||
# Build the single-/multi-mer file lists.
def update_file_extension_and_check_existence(results, new_extension):
    """Replace each result file's suffix with *new_extension* and return only
    those rewritten paths that actually exist on disk."""
    renamed = (result.file.with_name(result.file.stem + new_extension)
               for result in results)
    return [path for path in renamed if path.exists()]
|
||
|
||
# Swap in the .modellerfix.pdb extension for the single and multimer entries,
# keeping only files that exist on disk.
single_updated = update_file_extension_and_check_existence(single, '.modellerfix.pdb')
multimer_updated = update_file_extension_and_check_existence(multimer, '.modellerfix.pdb')

print("单聚体更新文件列表:")
print([path.as_posix() for path in single_updated])
print("多聚体更新文件列表:")
print([path.as_posix() for path in multimer_updated])
|
||
|
||
# Copy the classified files into per-category folders.
import shutil


def copy_files_to_directory(files, directory_name):
    """Copy each path in *files* into *directory_name*, creating the target
    directory (including missing parents) when necessary.

    Existing files with the same name are overwritten by shutil.copy.
    """
    target_dir = Path(directory_name)
    # parents=True: a nested target path no longer raises FileNotFoundError;
    # exist_ok=True keeps reruns idempotent (as in the original).
    target_dir.mkdir(parents=True, exist_ok=True)

    for file in files:
        shutil.copy(file, target_dir / file.name)
|
||
|
||
# Copy the monomer files.
copy_files_to_directory(single_updated, 'single_polymeric_files')

# Copy the multimer files.
copy_files_to_directory(multimer_updated, 'multimer_polymeric_files')

print("文件复制完成。")

# Show the per-chain flags behind each unclassified ("unknown") entry.
for entry in unknown:
    print(f"PDB ID: {entry.file.stem}")
    for rec in entry.sequences:
        print(f"  Chain ID: {rec.header_info.chain_ids}, Is Polymeric: {rec.header_info.is_polymeric}")
|
||
|
||
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart: how many entries fell into each polymeric state.
categories = ['Single', 'Multimer', 'Unknown']
counts = [len(single), len(multimer), len(unknown)]

sns.set(style="whitegrid")
plt.figure(figsize=(8, 4))
ax = sns.barplot(x=categories, y=counts, palette="Blues_d")

# Title and axis labels.
plt.title('Distribution of Polymeric States in FASTA Files')
plt.xlabel('Polymeric State')
plt.ylabel('Count')

# Write each bar's height just above it.
for bar in ax.patches:
    height = bar.get_height()
    ax.annotate(f'{int(height)}',
                (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center', fontsize=10, color='gray',
                xytext=(0, 5), textcoords='offset points')

# Save the chart as an SVG file.
plt.savefig('/mnt/mydrive/analysis_pdb-dev/polymeric_states_distribution.svg', format='svg')
|
||
|
||
|
||
from analysis_pdb import PDBAnalyzer
import requests

# Download the RCSB FASTA for every multimer entry.
# BUG FIX: the original referenced `output_file` in the `.exists()` skip check
# BEFORE assigning it, which raised NameError on the first iteration. The
# assignment now comes first, and the check happens before the HTTP request
# so already-downloaded entries cost no network call.
Path('multi').mkdir(exist_ok=True)  # ensure output dir exists before writing
for i in multimer_pdbid:
    output_file = Path('multi') / f"{i}.fasta"
    if output_file.exists():
        continue
    url = f"https://www.rcsb.org/fasta/entry/{i.upper()}"
    response = requests.get(url)
    if response.status_code == 200:
        with open(output_file, 'w') as file:
            file.write(response.text)
        if not output_file.exists():
            raise Exception(f"Failed to download FASTA file for PDB ID {i}")
    else:
        raise Exception(f"Failed to download FASTA file for PDB ID {i}")

# Stage the matching fixed PDB files alongside the multimer FASTAs.
Path('multi_pdb').mkdir(exist_ok=True)
for i in multimer_pdbid:
    shutil.copy(Path('fixed') / f'{i}.pdb', Path('multi_pdb') / f'{i}.pdb')
|
||
|
||
# Download the RCSB FASTA for every single (monomeric) entry.
# Improvement over the original: the existence check now runs BEFORE the HTTP
# request, so entries already on disk no longer trigger a wasted network call.
Path('single').mkdir(exist_ok=True)  # ensure output dir exists before writing
for i in single_pdbid:
    output_file = Path('single') / f"{i}.fasta"
    if output_file.exists():
        continue
    url = f"https://www.rcsb.org/fasta/entry/{i.upper()}"
    response = requests.get(url)
    if response.status_code == 200:
        with open(output_file, 'w') as file:
            file.write(response.text)
        if not output_file.exists():
            raise Exception(f"Failed to download FASTA file for PDB ID {i}")
    else:
        raise Exception(f"Failed to download FASTA file for PDB ID {i}")

# Stage the matching fixed PDB files alongside the single-entry FASTAs.
Path('single_pdb').mkdir(exist_ok=True)
for i in single_pdbid:
    shutil.copy(Path('fixed') / f'{i}.pdb', Path('single_pdb') / f'{i}.pdb')
|
||
|
||
from pymol import cmd

# Fetch entry 4OZI from the RCSB PDB as a .pdb file
# (pymol writes it into the current working directory by default — TODO confirm fetch_path setting).
cmd.fetch('4ozi', type='pdb')