#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file        :sequence_base.py
@Description :Base classes for protein sequence information
@Date        :2024/01/19 10:17:41
@Author      :lyzeng
@Email       :pylyzeng@gmail.com
@version     :1.0
'''
from Bio.Data import IUPACData
from pydantic import BaseModel, Field, FilePath, computed_field, field_validator


# One-letter amino-acid codes, taken from Biopython's IUPAC protein alphabet.
AMINO_ACIDS = set(IUPACData.protein_letters)


class ProteinSequence(BaseModel):
    """A protein chain sequence together with its two chain identifiers.

    Attributes:
        label_asym_id: Chain identifier stored under the ``label`` naming.
        auth_asym_id: Chain identifier stored under the ``auth`` naming.
        sequence: The chain's sequence string (not validated by this model).
    """

    label_asym_id: str
    auth_asym_id: str
    sequence: str

    # NOTE: this used to be declared both as a stored field and as a property
    # of the same name, which collide. It is now a single computed, read-only
    # field so the value is always derived from the two identifiers and still
    # shows up in serialized output.
    @computed_field
    @property
    def is_id_consistent(self) -> bool:
        """Return True when ``label_asym_id`` equals ``auth_asym_id``."""
        return self.label_asym_id == self.auth_asym_id

    def display_chain_id(self) -> str:
        """Return a chain id suitable for display.

        Returns:
            ``label_asym_id`` alone when both identifiers match, otherwise
            ``"<label_asym_id> [auth <auth_asym_id>]"``.
        """
        if not self.is_id_consistent:
            return f"{self.label_asym_id} [auth {self.auth_asym_id}]"
        return self.label_asym_id


class BaseProteinSequence(BaseModel):
    """A bare protein sequence restricted to the letters in ``AMINO_ACIDS``."""

    sequence: str

    # Registered as a field validator so the check actually runs whenever a
    # model instance is built; without the decorator it was never invoked.
    @field_validator('sequence')
    @classmethod
    def validate_amino_acids(cls, sequence: str) -> str:
        """Check that every character of ``sequence`` is in ``AMINO_ACIDS``.

        Args:
            sequence: Candidate sequence string.

        Returns:
            The unchanged ``sequence`` when all characters are allowed.

        Raises:
            ValueError: If any character is not in ``AMINO_ACIDS``.
        """
        if not set(sequence).issubset(AMINO_ACIDS):
            raise ValueError('Sequence contains invalid amino acids, not conforming to IUPAC standards')
        return sequence