Files
analysis_pdb/tcr_pmhc_complexes.py
2024-01-18 11:33:36 +08:00

129 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :tcr_pmhc_complexes.py
@Description: Builds a dataset from FASTA files
@Date :2024/01/17 10:24:08
@Author :lyzeng
@Email :pylyzeng@gmail.com
@version :1.0
'''
from Bio.Data import IUPACData
from pydantic import BaseModel, Field, FilePath, field_validator
from typing import Optional, Dict, List, Any, Union
from pathlib import Path
# Amino-acid one-letter codes, taken from Biopython's IUPAC data
AMINO_ACIDS = set(IUPACData.protein_letters)
class BaseProteinSequence(BaseModel):
    """A protein sequence whose letters are checked against ``AMINO_ACIDS``.

    Attributes:
        sequence: The residue string. Every character must be a member of
            the module-level ``AMINO_ACIDS`` set.
    """

    sequence: str

    @field_validator('sequence')
    @classmethod
    def validate_amino_acids(cls, v: str) -> str:
        """Reject sequences containing characters outside ``AMINO_ACIDS``.

        Args:
            v: The candidate sequence string.

        Returns:
            The sequence unchanged when every character is allowed.

        Raises:
            ValueError: If at least one character is not in ``AMINO_ACIDS``.
        """
        # Set comparison: order and repetition of residues are irrelevant
        # here, only which distinct letters occur. An empty string passes.
        if not set(v).issubset(AMINO_ACIDS):
            raise ValueError('Sequence contains invalid amino acids, not conforming to IUPAC standards')
        return v
class ProteinSequence(BaseModel):
    """A single protein chain together with both of its chain identifiers.

    Attributes:
        label_asym_id: Chain ID assigned by the PDB.
        auth_asym_id: Chain ID chosen by the depositing author.
        sequence: Residue string of the chain.
    """

    label_asym_id: str  # chain ID assigned by the PDB
    auth_asym_id: str  # chain ID chosen by the author
    sequence: str

    # ``is_id_consistent`` is intentionally NOT declared as a model field.
    # A field and a property sharing one name collide: the later ``def`` in
    # the class body rebinds the name, so the field's default would become
    # the property object itself instead of a bool. The value is fully
    # determined by the two IDs, so it is exposed as a read-only property.
    @property
    def is_id_consistent(self) -> bool:
        """Return True when the PDB-assigned and author-chosen IDs match."""
        return self.label_asym_id == self.auth_asym_id

    def display_chain_id(self) -> str:
        """Return a printable chain ID.

        Returns:
            ``label_asym_id`` alone when both IDs agree, otherwise
            ``"<label_asym_id> [auth <auth_asym_id>]"``.
        """
        if not self.is_id_consistent:
            return f"{self.label_asym_id} [auth {self.auth_asym_id}]"
        return self.label_asym_id
class ProteinComplex(BaseModel):
    """Container for the chains of one complex plus its metadata.

    Every chain slot is optional and defaults to ``None``.

    Attributes:
        pdb_id: Identifier of the entry.
        tcr_alpha: TCR alpha chain, if present.
        tcr_beta: TCR beta chain, if present.
        mhc_alpha: MHC alpha chain, if present.
        mhc_beta: MHC beta chain, if present.
        peptide: Peptide chain, if present.
        mhc_class: MHC class label.
        data_source: Where the entry came from.
        extra_data: Free-form additional key/value pairs.
    """

    pdb_id: str
    tcr_alpha: Optional[ProteinSequence] = None
    tcr_beta: Optional[ProteinSequence] = None
    mhc_alpha: Optional[ProteinSequence] = None
    mhc_beta: Optional[ProteinSequence] = None
    peptide: Optional[ProteinSequence] = None
    mhc_class: str  # MHC class: 1, 2, or 'unknown'
    data_source: str  # provenance of the data, e.g. the DOI of a paper
    extra_data: Dict[str, Any] = {}

    def add_extra_data(self, **kwargs):
        """Store each keyword argument in ``extra_data``, overwriting existing keys."""
        for key, value in kwargs.items():
            self.extra_data[key] = value
class FastaHeaderInfo(BaseModel):
    """Structured view of one ``|``-separated FASTA header line.

    Attributes:
        pdb_id: First header segment with any ``>`` characters stripped
            from its ends.
        chain_ids: Chain IDs listed in the second segment, in order.
        auth_chain_ids: Maps each PDB-assigned chain ID to its
            author-assigned chain ID (identical when the header gives none).
        description: Third header segment, or ``''`` when absent.
    """

    pdb_id: str
    chain_ids: List[str]
    auth_chain_ids: Dict[str, str]  # PDB-assigned chain ID -> author-assigned chain ID
    description: str

    @classmethod
    def from_header_line(cls, header_line: str):
        """Build a ``FastaHeaderInfo`` from a raw header line.

        The line is split on ``|``. The second segment is read as a
        comma-separated chain list such as ``Chains A, B[auth C]``.

        Args:
            header_line: The header text, with or without the leading ``>``.

        Returns:
            A populated ``FastaHeaderInfo``. Missing segments give an empty
            chain list and/or an empty description.
        """
        parts = header_line.split('|')
        pdb_id = parts[0].strip('>')
        chain_info = parts[1] if len(parts) > 1 else ''
        description = parts[2] if len(parts) > 2 else ''

        # Drop the "Chain "/"Chains " label so only the ID list remains.
        chain_text = chain_info.replace('Chain ', '').replace('Chains ', '')

        chain_ids = []
        auth_chain_ids = {}
        for part in chain_text.split(','):
            part = part.strip()
            if not part:
                # An absent chain segment or a stray comma must not register
                # an empty-string chain ID.
                continue
            # partition never raises, unlike unpacking the result of split('[')
            # when a token contains more than one '['.
            chain_id, bracket, auth_part = part.partition('[')
            chain_id = chain_id.strip()
            if bracket:
                # "A[auth B]" -> author ID "B": strip the bracket and the 'auth ' tag.
                auth_chain_id = auth_part.strip(']').replace('auth ', '').strip()
            else:
                # No author ID given: reuse the PDB-assigned one.
                auth_chain_id = chain_id
            chain_ids.append(chain_id)
            auth_chain_ids[chain_id] = auth_chain_id

        return cls(pdb_id=pdb_id, chain_ids=chain_ids, auth_chain_ids=auth_chain_ids, description=description)
class FastaSequence(BaseModel):
    """One FASTA record: its parsed header paired with its validated sequence."""

    # Parsed contents of the record's header line.
    header_info: FastaHeaderInfo
    # Sequence body of the record, wrapped in the validating model.
    sequence: BaseProteinSequence
class FastaFile(BaseModel):
    """A FASTA file on disk together with the records parsed from it.

    The file is parsed as part of construction.

    Attributes:
        file: Path to an existing FASTA file.
        sequences: Records parsed from ``file``; (re)built by
            ``read_sequence``.
    """

    file: FilePath
    sequences: List[FastaSequence] = []

    def __init__(self, *args, **kwargs):
        """Validate the fields, then immediately parse ``file``."""
        super().__init__(*args, **kwargs)
        self.read_sequence()

    @property
    def sequence_num(self) -> int:
        """Number of records currently held in ``sequences``."""
        return len(self.sequences)

    def read_sequence(self):
        """Parse ``file`` and replace ``sequences`` with its records.

        A line starting with ``>`` begins a new record; all following lines
        up to the next header are concatenated (after stripping surrounding
        whitespace) into that record's sequence. A header that is followed
        by no sequence text produces no record.

        The result is assigned rather than appended, so calling this method
        again re-reads the file instead of duplicating every record.

        Raises:
            ValueError: Propagated from ``BaseProteinSequence`` validation
                when a sequence contains a disallowed character.
        """
        records = []
        header_line = ''
        chunks = []

        def flush():
            # Emit the record collected so far, if it has any sequence text.
            sequence = ''.join(chunks)
            if sequence:
                header_info = FastaHeaderInfo.from_header_line(header_line)
                records.append(
                    FastaSequence(
                        header_info=header_info,
                        sequence=BaseProteinSequence(sequence=sequence),
                    )
                )
            chunks.clear()

        # Explicit encoding keeps parsing independent of the platform default.
        with open(self.file, 'r', encoding='utf-8') as fasta_file:
            for line in fasta_file:
                line = line.strip()
                if line.startswith('>'):
                    flush()
                    header_line = line
                else:
                    chunks.append(line)
        # The last record has no following header to trigger its flush.
        flush()

        self.sequences = records