Files
analysis_pdb/tcr_pmhc_complexes.py
2024-01-18 14:44:07 +08:00

189 lines
7.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :tcr_pmhc_complexes.py
@Description :Build datasets from FASTA files
@Date :2024/01/17 10:24:08
@Author :lyzeng
@Email :pylyzeng@gmail.com
@version :1.0
'''
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from Bio.Data import IUPACData
from pydantic import BaseModel, Field, FilePath, computed_field, field_validator
# One-letter amino-acid codes, taken from BioPython's IUPACData.protein_letters
AMINO_ACIDS = set(IUPACData.protein_letters)
class BaseProteinSequence(BaseModel):
    """A bare protein sequence whose letters are checked against ``AMINO_ACIDS``.

    Attributes are documented inline; validation happens when the model is
    constructed.
    """

    # Amino-acid sequence in one-letter code.
    sequence: str

    # The decorator registers the check with pydantic; without it the method
    # is an ordinary classmethod that is never invoked during validation.
    @field_validator('sequence')
    @classmethod
    def validate_amino_acids(cls, sequence: str) -> str:
        """Reject sequences containing letters outside ``AMINO_ACIDS``.

        Args:
            sequence: Candidate sequence string.

        Returns:
            The unchanged sequence when every character is in ``AMINO_ACIDS``.

        Raises:
            ValueError: If any character is not in ``AMINO_ACIDS``.
        """
        if not set(sequence).issubset(AMINO_ACIDS):
            raise ValueError('Sequence contains invalid amino acids, not conforming to IUPAC standards')
        return sequence
class ProteinSequence(BaseModel):
    """One chain of a structure, identified by its label and author chain ids."""

    # Chain id as assigned in the structure file (label_asym_id).
    label_asym_id: str
    # Chain id as assigned by the depositing author (auth_asym_id).
    auth_asym_id: str
    # Chain sequence as a plain string (not validated here).
    sequence: str

    # Declared once, as a computed field: a stored field and a property with
    # the same name cannot coexist, and the value is fully derived from the
    # two ids anyway. ``computed_field`` keeps it in serialized output.
    @computed_field
    @property
    def is_id_consistent(self) -> bool:
        """Whether the label chain id equals the author chain id."""
        return self.label_asym_id == self.auth_asym_id

    def display_chain_id(self) -> str:
        """Return the chain id for display.

        Returns:
            ``label_asym_id`` alone when both ids agree, otherwise
            ``"<label> [auth <auth>]"``.
        """
        if not self.is_id_consistent:
            return f"{self.label_asym_id} [auth {self.auth_asym_id}]"
        return self.label_asym_id
class ProteinComplex(BaseModel):
    """Chains of one complex plus free-form metadata.

    Every chain slot is optional and defaults to ``None``.
    """

    pdb_id: str
    tcr_alpha: Optional[ProteinSequence] = None
    tcr_beta: Optional[ProteinSequence] = None
    mhc_alpha: Optional[ProteinSequence] = None
    mhc_beta: Optional[ProteinSequence] = None
    peptide: Optional[ProteinSequence] = None
    mhc_class: str  # MHC class: 1, 2, or '未知' ("unknown")
    data_source: str  # where the data came from, e.g. the DOI of a paper
    extra_data: Dict[str, Any] = {}

    def add_extra_data(self, **kwargs):
        """Store each keyword argument in ``extra_data``, overwriting existing keys."""
        for key, value in kwargs.items():
            self.extra_data[key] = value
class FastaHeaderInfo(BaseModel):
    """Fields parsed from a single FASTA header line.

    Pipe-separated headers of the form
    ``>ID|Chains A, B[auth C]|description`` are split into an id, the chain
    ids and a description. Headers without ``|`` keep only a description.
    """

    # Text before the first '|' with '>' stripped; None for headers without '|'.
    pdb_id: Optional[str] = None
    # Chain ids in the order they appear in the header.
    chain_ids: List[str] = []
    # Maps each chain id to its "[auth X]" id, or to itself when none is given.
    auth_chain_ids: Dict[str, str] = {}
    # Third '|'-separated field, or the whole header when there is no '|'.
    description: str = ''
    # "Yes" for more than one chain, "No" for exactly one, "Unknown" for none.
    is_polymeric: Optional[str] = None

    @classmethod
    def from_header_line(cls, header_line: str) -> "FastaHeaderInfo":
        """Build a ``FastaHeaderInfo`` from a raw header line.

        Args:
            header_line: The header line, normally starting with ``>``.

        Returns:
            The parsed header. Empty chain entries are skipped, and a header
            that yields no chain ids is reported with ``is_polymeric`` set to
            ``"Unknown"``.
        """
        if '|' not in header_line:
            # No chain information at all: keep the text after the leading
            # '>' as the description and mark the polymeric state unknown.
            return cls(
                pdb_id=None,
                chain_ids=[],
                auth_chain_ids={},
                description=header_line[1:].strip(),
                is_polymeric="Unknown",
            )

        parts = header_line.split('|')
        pdb_id = parts[0].strip('>')
        chain_info = parts[1]  # always present because the header contains '|'
        description = parts[2] if len(parts) > 2 else ''

        chain_ids = []
        auth_chain_ids = {}
        chain_text = chain_info.replace('Chain ', '').replace('Chains ', '')
        for token in chain_text.split(','):
            token = token.strip()
            if not token:
                # ''.split(',') yields [''], so an empty chain segment would
                # otherwise be recorded as a chain named ''.
                continue
            # partition never raises, unlike unpacking split('[') when the
            # token contains more than one '['.
            chain_id, bracket, auth_part = token.partition('[')
            chain_id = chain_id.strip()
            if bracket:
                auth_chain_id = auth_part.strip(']').replace('auth ', '').strip()
            else:
                auth_chain_id = chain_id
            chain_ids.append(chain_id)
            auth_chain_ids[chain_id] = auth_chain_id

        if not chain_ids:
            # Same convention as the no-'|' branch: no chains, state unknown.
            is_polymeric = "Unknown"
        elif len(chain_ids) > 1:
            is_polymeric = "Yes"
        else:
            is_polymeric = "No"

        return cls(
            pdb_id=pdb_id,
            chain_ids=chain_ids,
            auth_chain_ids=auth_chain_ids,
            description=description,
            is_polymeric=is_polymeric,
        )
# Abandoned attempt at also supporting standard (non-PDB) FASTA headers; kept for reference only
# class FastaHeaderInfo(BaseModel):
# info: Dict[str, Union[str, List[str], Dict[str, str]]] = {}
# is_polymeric: Optional[str] = None # 新增属性
# @classmethod
# def from_header_line(cls, header_line: str):
# header_info = {}
# if '|' in header_line and 'Chain' in header_line:
# # PDB FASTA格式处理
# parts = header_line.split('|')
# header_info['identifier'] = parts[0].strip('>')
# chain_info = parts[1] if len(parts) > 1 else ''
# header_info['description'] = parts[2] if len(parts) > 2 else ''
# chain_parts = chain_info.replace('Chain ', '').replace('Chains ', '').split(',')
# chain_ids = []
# auth_chain_ids = {}
# for part in chain_parts:
# part = part.strip()
# if '[' in part:
# chain_id, auth_chain_id = part.split('[')
# chain_id = chain_id.strip()
# auth_chain_id = auth_chain_id.strip(']').strip()
# chain_ids.append(chain_id)
# auth_chain_ids[chain_id] = auth_chain_id
# else:
# chain_ids.append(part)
# auth_chain_ids[part] = part
# header_info['chain_ids'] = chain_ids
# header_info['auth_chain_ids'] = auth_chain_ids
# header_info['is_polymeric'] = "Yes" if len(chain_ids) > 1 else "No"
# else:
# # 处理非PDB或非标准FASTA头信息
# identifier = header_line[1:].split()[0] # 取第一个空格前的文本作为标识符
# description = ' '.join(header_line[1:].split()[1:]) # 剩余的文本作为描述
# header_info['identifier'] = identifier
# header_info['description'] = description
# header_info['is_polymeric'] = "Unknown"
# return cls(info=header_info)
class FastaSequence(BaseModel):
    """One FASTA record: the parsed header paired with its sequence model."""

    header_info: FastaHeaderInfo
    sequence: BaseProteinSequence

    @property
    def sequence_length(self) -> int:
        """Number of characters in the record's sequence string."""
        # ``self.sequence`` is a BaseProteinSequence; the raw string lives in
        # its own ``sequence`` field, hence the double attribute access.
        residues = self.sequence.sequence
        return len(residues)
class FastaFile(BaseModel):
    """A FASTA file on disk together with the records parsed from it.

    The file is read as soon as the model is constructed, so ``sequences``
    is populated right after ``FastaFile(file=...)`` returns.
    """

    # Path to the FASTA file (pydantic ``FilePath``).
    file: FilePath
    # Records parsed from ``file``; rebuilt on every ``read_sequence`` call.
    sequences: List[FastaSequence] = []

    def __init__(self, *args, **kwargs):
        """Validate the fields, then parse the file immediately."""
        super().__init__(*args, **kwargs)
        self.read_sequence()

    @property
    def sequence_num(self) -> int:
        """Number of records currently held in ``sequences``."""
        return len(self.sequences)

    def _build_record(self, header_line: str, chunks: List[str]) -> FastaSequence:
        """Assemble one record from its header line and its sequence lines."""
        base_sequence = BaseProteinSequence(sequence=''.join(chunks))
        header_info = FastaHeaderInfo.from_header_line(header_line)
        return FastaSequence(header_info=header_info, sequence=base_sequence)

    def read_sequence(self):
        """Parse ``file`` and replace ``sequences`` with its records.

        A record is a header line starting with ``>`` followed by one or more
        sequence lines, which are concatenated after stripping whitespace.
        A header with no sequence lines before the next header (or the end of
        the file) produces no record.

        The list is rebuilt from scratch rather than appended to, so calling
        this method again, or constructing the model from data that already
        contains ``sequences``, does not duplicate records.
        """
        records: List[FastaSequence] = []
        header_line = ''
        chunks: List[str] = []
        with open(self.file, 'r') as fasta_file:
            for raw_line in fasta_file:
                line = raw_line.strip()
                if line.startswith('>'):
                    if chunks:
                        records.append(self._build_record(header_line, chunks))
                        chunks = []
                    header_line = line
                elif line:
                    # Only non-empty lines are collected, so ``chunks`` is
                    # non-empty exactly when the joined sequence is non-empty.
                    chunks.append(line)
        # The last record has no following header to trigger its flush.
        if chunks:
            records.append(self._build_record(header_line, chunks))
        self.sequences = records