add fasta file handle;

2024-01-18 11:33:36 +08:00
parent dabb69c1af
commit cc702c8fec
1 changed files with 128 additions and 0 deletions
--- a/tcr_pmhc_complexes.py
+++ b/tcr_pmhc_complexes.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@file               :tcr_pmhc_complexes.py
+@Description        :Build datasets from FASTA files
+@Date               :2024/01/17 10:24:08
+@Author             :lyzeng
+@Email              :pylyzeng@gmail.com
+@version            :1.0
+'''
+from Bio.Data import IUPACData
+from pydantic import BaseModel, Field, FilePath, field_validator
+from typing import Optional, Dict, List, Any, Union
+from pathlib import Path
+
+# Allowed one-letter amino-acid codes, taken from Biopython's IUPACData.protein_letters
+AMINO_ACIDS = set(IUPACData.protein_letters)
+
+class BaseProteinSequence(BaseModel):
+    sequence: str
+
+    @field_validator('sequence')
+    def validate_amino_acids(cls, v: str, values, **kwargs):
+        if not set(v).issubset(AMINO_ACIDS):
+            raise ValueError('Sequence contains invalid amino acids, not conforming to IUPAC standards')
+        return v
+
+class ProteinSequence(BaseModel):
+    label_asym_id: str  # PDB 分配的链 ID
+    auth_asym_id: str  # 作者选择的链 ID
+    sequence: str
+    is_id_consistent: bool = Field(default=False)  # ID 是否一致
+
+    @property
+    def is_id_consistent(self) -> bool:
+        # 检查两个 ID 是否一致
+        return self.label_asym_id == self.auth_asym_id
+
+    def display_chain_id(self) -> str:
+        """
+        显示链的 ID。如果 PDB ID 和作者 ID 不一致，则以 "label_asym_id [auth auth_asym_id]" 的格式显示。
+        """
+        if not self.is_id_consistent:
+            return f"{self.label_asym_id} [auth {self.auth_asym_id}]"
+        return self.label_asym_id
+
+class ProteinComplex(BaseModel):
+    pdb_id: str
+    tcr_alpha: Optional[ProteinSequence] = None
+    tcr_beta: Optional[ProteinSequence] = None
+    mhc_alpha: Optional[ProteinSequence] = None
+    mhc_beta: Optional[ProteinSequence] = None
+    peptide: Optional[ProteinSequence] = None
+    mhc_class: str  # MHC 类型：1, 2, 或 '未知'
+    data_source: str  # 数据来源，例如论文的 DOI 号
+    extra_data: Dict[str, Any] = {}
+
+    def add_extra_data(self, **kwargs):
+        self.extra_data.update(kwargs)
+
+class FastaHeaderInfo(BaseModel):
+    pdb_id: str
+    chain_ids: List[str]
+    auth_chain_ids: Dict[str, str]  # 映射 PDB 分配的链 ID 到作者分配的链 ID
+    description: str
+
+    @classmethod
+    def from_header_line(cls, header_line: str):
+        parts = header_line.split('|')
+        pdb_id = parts[0].strip('>')
+        chain_info = parts[1] if len(parts) > 1 else ''
+        description = parts[2] if len(parts) > 2 else ''
+
+        # 分割链信息以获取链 ID 和作者链 ID
+        chain_parts = chain_info.replace('Chain ', '').replace('Chains ', '').split(',')
+        chain_ids = []
+        auth_chain_ids = {}
+
+        for part in chain_parts:
+            part = part.strip()
+            if '[' in part:
+                # 分离作者链 ID
+                chain_id, auth_chain_id = part.split('[')
+                chain_id = chain_id.strip()
+                auth_chain_id = auth_chain_id.strip(']').replace('auth ', '').strip()  # 移除 'auth ' 但保留关键信息
+                chain_ids.append(chain_id)
+                auth_chain_ids[chain_id] = auth_chain_id
+            else:
+                chain_ids.append(part)
+                auth_chain_ids[part] = part  # 没有作者链 ID 时，使用相同的值
+
+        return cls(pdb_id=pdb_id, chain_ids=chain_ids, auth_chain_ids=auth_chain_ids, description=description)
+
+class FastaSequence(BaseModel):
+    """One FASTA record: the parsed header together with its sequence."""
+
+    header_info: FastaHeaderInfo  # fields parsed from the record's '>' header line
+    sequence: BaseProteinSequence  # residue string wrapped in the validating model
+
+class FastaFile(BaseModel): 
+    file: FilePath
+    sequences: List[FastaSequence] = []
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.read_sequence()
+
+    @property
+    def sequence_num(self) -> int:
+        return len(self.sequences)
+
+    def read_sequence(self):
+        with open(self.file, 'r') as fasta_file:
+            header_line = ''
+            sequence = ''
+            for line in fasta_file:
+                line = line.strip()
+                if line.startswith('>'):
+                    if sequence:
+                        header_info = FastaHeaderInfo.from_header_line(header_line)
+                        self.sequences.append(FastaSequence(header_info=header_info, sequence=BaseProteinSequence(sequence=sequence)))
+                        sequence = ''
+                    header_line = line
+                else:
+                    sequence += line
+
+            if sequence:
+                header_info = FastaHeaderInfo.from_header_line(header_line)
+                self.sequences.append(FastaSequence(header_info=header_info, sequence=BaseProteinSequence(sequence=sequence)))
+