add fasta file handle;
This commit is contained in:
128
tcr_pmhc_complexes.py
Normal file
128
tcr_pmhc_complexes.py
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
'''
|
||||||
|
@file :tcr_pmhc_complexes.py
|
||||||
|
@Description: Build datasets from FASTA files
|
||||||
|
@Date :2024/01/17 10:24:08
|
||||||
|
@Author :lyzeng
|
||||||
|
@Email :pylyzeng@gmail.com
|
||||||
|
@version :1.0
|
||||||
|
'''
|
||||||
|
from Bio.Data import IUPACData
|
||||||
|
from pydantic import BaseModel, Field, FilePath, field_validator
|
||||||
|
from typing import Optional, Dict, List, Any, Union
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Single-letter amino-acid codes taken from Biopython's IUPAC protein letters;
# this is the allowed character set when validating protein sequences.
AMINO_ACIDS = set(IUPACData.protein_letters)
|
||||||
|
|
||||||
|
class BaseProteinSequence(BaseModel):
    """A protein sequence restricted to the characters in ``AMINO_ACIDS``."""

    sequence: str

    @field_validator('sequence')
    def validate_amino_acids(cls, v: str, values, **kwargs):
        """Reject sequences containing characters outside ``AMINO_ACIDS``.

        Args:
            v: Candidate sequence string.
            values: Unused by this validator.
            **kwargs: Unused by this validator.

        Returns:
            The sequence unchanged when every character is allowed.

        Raises:
            ValueError: If any character of ``v`` is not in ``AMINO_ACIDS``.
        """
        # Anything left after removing the allowed letters is invalid.
        unexpected = set(v) - AMINO_ACIDS
        if unexpected:
            raise ValueError('Sequence contains invalid amino acids, not conforming to IUPAC standards')
        return v
|
||||||
|
|
||||||
|
class ProteinSequence(BaseModel):
    """One chain of a structure: its two chain identifiers and its sequence."""

    label_asym_id: str  # chain ID assigned by the PDB
    auth_asym_id: str  # chain ID chosen by the author
    sequence: str

    # ``is_id_consistent`` is deliberately NOT declared as a model field.
    # A field and a property with the same name collide in the class body
    # (the later definition rebinds the name), so the value is derived from
    # the two IDs in exactly one place: the property below.
    @property
    def is_id_consistent(self) -> bool:
        """Return True when the PDB-assigned and author-chosen chain IDs match."""
        return self.label_asym_id == self.auth_asym_id

    def display_chain_id(self) -> str:
        """Return the chain ID formatted for display.

        Returns:
            ``label_asym_id`` alone when both IDs match, otherwise
            ``"label_asym_id [auth auth_asym_id]"``.
        """
        if self.is_id_consistent:
            return self.label_asym_id
        return f"{self.label_asym_id} [auth {self.auth_asym_id}]"
|
||||||
|
|
||||||
|
class ProteinComplex(BaseModel):
    """A TCR-pMHC complex: optional per-chain sequences plus metadata."""

    pdb_id: str
    tcr_alpha: Optional[ProteinSequence] = None
    tcr_beta: Optional[ProteinSequence] = None
    mhc_alpha: Optional[ProteinSequence] = None
    mhc_beta: Optional[ProteinSequence] = None
    peptide: Optional[ProteinSequence] = None
    mhc_class: str  # MHC class: 1, 2, or '未知' (unknown)
    data_source: str  # where the data came from, e.g. the DOI of a paper
    extra_data: Dict[str, Any] = {}

    def add_extra_data(self, **kwargs):
        """Store each keyword argument in ``extra_data``, overwriting existing keys."""
        for key, value in kwargs.items():
            self.extra_data[key] = value
|
||||||
|
|
||||||
|
class FastaHeaderInfo(BaseModel):
    """Fields parsed from a ``|``-separated FASTA header line."""

    pdb_id: str
    chain_ids: List[str]
    auth_chain_ids: Dict[str, str]  # maps PDB-assigned chain ID -> author-assigned chain ID
    description: str

    @classmethod
    def from_header_line(cls, header_line: str):
        """Build a ``FastaHeaderInfo`` from one FASTA header line.

        The line is split on ``|``: the first field (minus ``>``) becomes
        ``pdb_id``, the second is the chain list, the third the description.
        Missing fields default to empty values.

        Example:
            ``>1AO7_1|Chains A[auth D], B|some protein`` gives
            ``pdb_id='1AO7_1'``, ``chain_ids=['A', 'B']`` and
            ``auth_chain_ids={'A': 'D', 'B': 'B'}``.

        Args:
            header_line: Header line, with or without the leading ``>``.

        Returns:
            A new instance populated from the header.
        """
        parts = header_line.split('|')
        pdb_id = parts[0].strip('>')
        chain_info = parts[1] if len(parts) > 1 else ''
        description = parts[2] if len(parts) > 2 else ''

        # Drop the "Chain " / "Chains " prefix, leaving a comma-separated list.
        chain_list = chain_info.replace('Chain ', '').replace('Chains ', '')

        chain_ids = []
        auth_chain_ids = {}
        for part in chain_list.split(','):
            part = part.strip()
            if not part:
                # A missing chain field or a stray comma must not produce an
                # empty-string chain ID.
                continue

            # "A[auth D]" -> chain "A" with author ID "D"; plain "A" has no
            # bracket. partition() tolerates an unexpected second '['.
            chain_id, bracket, auth_part = part.partition('[')
            chain_id = chain_id.strip()
            if bracket:
                # Remove the closing bracket and the 'auth ' marker.
                auth_chain_id = auth_part.strip(']').replace('auth ', '').strip()
            else:
                # No author chain ID given: reuse the PDB-assigned one.
                auth_chain_id = chain_id

            chain_ids.append(chain_id)
            auth_chain_ids[chain_id] = auth_chain_id

        return cls(pdb_id=pdb_id, chain_ids=chain_ids, auth_chain_ids=auth_chain_ids, description=description)
|
||||||
|
|
||||||
|
class FastaSequence(BaseModel):
    """One FASTA record: the parsed header together with its sequence."""

    # Parsed header fields of the record.
    header_info: FastaHeaderInfo
    # Sequence body of the record, wrapped in its validating model.
    sequence: BaseProteinSequence
|
||||||
|
|
||||||
|
class FastaFile(BaseModel):
    """A FASTA file whose records are parsed into ``sequences`` on construction."""

    file: FilePath
    sequences: List[FastaSequence] = []

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Parse immediately so ``sequences`` is populated once the model exists.
        self.read_sequence()

    @property
    def sequence_num(self) -> int:
        """Number of records currently held in ``sequences``."""
        return len(self.sequences)

    def read_sequence(self):
        """Parse ``self.file`` and append one ``FastaSequence`` per record.

        A record is a ``>`` header line followed by one or more sequence
        lines, which are concatenated. Blank lines are ignored, and a header
        with no sequence lines produces no record. Records are appended, so
        calling this method again adds the file's records a second time.

        Raises:
            ValueError: If sequence data appears before the first header, or
                if a sequence fails ``BaseProteinSequence`` validation.
        """
        header_line = None
        chunks = []
        # Explicit encoding keeps decoding independent of the platform locale.
        with open(self.file, 'r', encoding='utf-8') as fasta_file:
            for raw_line in fasta_file:
                line = raw_line.strip()
                if line.startswith('>'):
                    # A new header closes the record collected so far.
                    self._append_record(header_line, chunks)
                    header_line = line
                    chunks = []
                elif line:
                    if header_line is None:
                        raise ValueError(
                            f"Malformed FASTA file {self.file}: sequence data found before the first '>' header"
                        )
                    chunks.append(line)

        # The last record has no following header to close it.
        self._append_record(header_line, chunks)

    def _append_record(self, header_line, chunks):
        """Append the record described by ``header_line`` and ``chunks``, if any."""
        if not chunks:
            return
        header_info = FastaHeaderInfo.from_header_line(header_line)
        sequence = BaseProteinSequence(sequence=''.join(chunks))
        self.sequences.append(FastaSequence(header_info=header_info, sequence=sequence))
|
||||||
|
|
||||||
Reference in New Issue
Block a user