update

2024-01-18 14:15:02 +08:00
parent e72ca52bcd
commit 5ad2d91ed3
3 changed files with 98 additions and 39 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -19,4 +19,4 @@ nohup.out
 pdb_test
 pdb_*
 .vscode
-data.tar.gz
+*.tar.gz
--- a/tcr_pmhc_complexes.py
+++ b/tcr_pmhc_complexes.py
@@ -19,27 +19,23 @@ AMINO_ACIDS = set(IUPACData.protein_letters)
 class BaseProteinSequence(BaseModel):
    sequence: str

-    @field_validator('sequence')
-    def validate_amino_acids(cls, v: str, values, **kwargs):
-        if not set(v).issubset(AMINO_ACIDS):
+    @classmethod
+    def validate_amino_acids(cls, sequence: str) -> str:
+        if not set(sequence).issubset(AMINO_ACIDS):
            raise ValueError('Sequence contains invalid amino acids, not conforming to IUPAC standards')
-        return v
+        return sequence

 class ProteinSequence(BaseModel):
-    label_asym_id: str  # PDB 分配的链 ID
-    auth_asym_id: str  # 作者选择的链 ID
+    label_asym_id: str
+    auth_asym_id: str
    sequence: str
-    is_id_consistent: bool = Field(default=False)  # ID 是否一致
+    is_id_consistent: bool = Field(default=False)

    @property
    def is_id_consistent(self) -> bool:
-        # 检查两个 ID 是否一致
        return self.label_asym_id == self.auth_asym_id

    def display_chain_id(self) -> str:
-        """
-        显示链的 ID。如果 PDB ID 和作者 ID 不一致，则以 "label_asym_id [auth auth_asym_id]" 的格式显示。
-        """
        if not self.is_id_consistent:
            return f"{self.label_asym_id} [auth {self.auth_asym_id}]"
        return self.label_asym_id
@@ -59,42 +55,57 @@ class ProteinComplex(BaseModel):
        self.extra_data.update(kwargs)

 class FastaHeaderInfo(BaseModel):
-    pdb_id: str
-    chain_ids: List[str]
-    auth_chain_ids: Dict[str, str]  # 映射 PDB 分配的链 ID 到作者分配的链 ID
-    description: str
+    pdb_id: Optional[str] = None
+    chain_ids: List[str] = []
+    auth_chain_ids: Dict[str, str] = {}
+    description: str = ''
+    is_polymeric: Optional[str] = None  # newly added attribute

    @classmethod
    def from_header_line(cls, header_line: str):
-        parts = header_line.split('|')
-        pdb_id = parts[0].strip('>')
-        chain_info = parts[1] if len(parts) > 1 else ''
-        description = parts[2] if len(parts) > 2 else ''
+        if '|' in header_line:
+            parts = header_line.split('|')
+            pdb_id = parts[0].strip('>')
+            chain_info = parts[1] if len(parts) > 1 else ''
+            description = parts[2] if len(parts) > 2 else ''

-        # 分割链信息以获取链 ID 和作者链 ID
-        chain_parts = chain_info.replace('Chain ', '').replace('Chains ', '').split(',')
-        chain_ids = []
-        auth_chain_ids = {}
+            chain_parts = chain_info.replace('Chain ', '').replace('Chains ', '').split(',')
+            chain_ids = []
+            auth_chain_ids = {}

-        for part in chain_parts:
-            part = part.strip()
-            if '[' in part:
-                # 分离作者链 ID
-                chain_id, auth_chain_id = part.split('[')
-                chain_id = chain_id.strip()
-                auth_chain_id = auth_chain_id.strip(']').replace('auth ', '').strip()  # 移除 'auth ' 但保留关键信息
-                chain_ids.append(chain_id)
-                auth_chain_ids[chain_id] = auth_chain_id
-            else:
-                chain_ids.append(part)
-                auth_chain_ids[part] = part  # 没有作者链 ID 时，使用相同的值
+            for part in chain_parts:
+                part = part.strip()
+                if '[' in part:
+                    chain_id, auth_chain_id = part.split('[')
+                    chain_id = chain_id.strip()
+                    auth_chain_id = auth_chain_id.strip(']').replace('auth ', '').strip()
+                    chain_ids.append(chain_id)
+                    auth_chain_ids[chain_id] = auth_chain_id
+                else:
+                    chain_ids.append(part)
+                    auth_chain_ids[part] = part
+
+            is_polymeric = "Yes" if len(chain_ids) > 1 else "No"
+        else:
+            pdb_id = None
+            chain_ids = []
+            auth_chain_ids = {}
+            description = header_line[1:].strip()  # drop the leading '>' character
+            is_polymeric = "Unknown"  # without chain info the polymeric state is unknown
+
+        return cls(pdb_id=pdb_id, chain_ids=chain_ids, auth_chain_ids=auth_chain_ids, description=description, is_polymeric=is_polymeric)

-        return cls(pdb_id=pdb_id, chain_ids=chain_ids, auth_chain_ids=auth_chain_ids, description=description)

 class FastaSequence(BaseModel):
    header_info: FastaHeaderInfo
    sequence: BaseProteinSequence

+    @property
+    def sequence_length(self) -> int:
+        # Note: sequence.sequence is used because sequence is a BaseProteinSequence instance,
+        # and BaseProteinSequence itself has a field named sequence
+        return len(self.sequence.sequence)
+
 class FastaFile(BaseModel): 
    file: FilePath
    sequences: List[FastaSequence] = []
@@ -115,14 +126,19 @@ class FastaFile(BaseModel):
                line = line.strip()
                if line.startswith('>'):
                    if sequence:
+                        # Create a BaseProteinSequence instance
+                        base_sequence = BaseProteinSequence(sequence=sequence)
                        header_info = FastaHeaderInfo.from_header_line(header_line)
-                        self.sequences.append(FastaSequence(header_info=header_info, sequence=BaseProteinSequence(sequence=sequence)))
+                        # Use the BaseProteinSequence instance rather than a raw string
+                        self.sequences.append(FastaSequence(header_info=header_info, sequence=base_sequence))
                        sequence = ''
                    header_line = line
                else:
                    sequence += line

+            # Make sure the sequence at the end of the file is added as well
            if sequence:
+                base_sequence = BaseProteinSequence(sequence=sequence)
                header_info = FastaHeaderInfo.from_header_line(header_line)
-                self.sequences.append(FastaSequence(header_info=header_info, sequence=BaseProteinSequence(sequence=sequence)))
+                self.sequences.append(FastaSequence(header_info=header_info, sequence=base_sequence))

--- a/test.py
+++ b/test.py
@@ -0,0 +1,43 @@
+from tcr_pmhc_complexes import FastaFile
+from pathlib import Path
+
+def test_fasta_file(file_path):
+    fasta_file = FastaFile(file=Path(file_path))
+
+    print(f"文件：{file_path}")
+    print(f"序列数量：{fasta_file.sequence_num}\n")
+
+    for seq in fasta_file.sequences:
+        header_info = seq.header_info
+        print(f"PDB ID: {header_info.pdb_id}")
+        print(f"Chain IDs: {', '.join(header_info.chain_ids)}")
+        print(f"Author Chain IDs: {', '.join([f'{cid} ({aid})' for cid, aid in header_info.auth_chain_ids.items()])}")
+        print(f"Description: {header_info.description}")
+        print(f"Sequence: {seq.sequence.sequence[:30]}...")  # print only a prefix of the sequence to keep the output short
+        print(f"Sequence Length: {seq.sequence_length}\n")  # added line that prints the sequence length
+
+def test_fasta_file1(file_path):
+    fasta_file = FastaFile(file=Path(file_path))
+
+    print(f"文件：{file_path}")
+    print(f"序列数量：{fasta_file.sequence_num}\n")
+
+    for seq in fasta_file.sequences:
+        header_info = seq.header_info
+        print(f"PDB ID: {header_info.pdb_id if header_info.pdb_id else 'N/A'}")
+        print(f"Chain IDs: {', '.join(header_info.chain_ids) if header_info.chain_ids else 'N/A'}")
+        print(f"Author Chain IDs: {', '.join([f'{cid} ({aid})' for cid, aid in header_info.auth_chain_ids.items()]) if header_info.auth_chain_ids else 'N/A'}")
+        print(f"Is Polymeric: {header_info.is_polymeric}")
+        print(f"Description: {header_info.description}")
+        print(f"Sequence: {seq.sequence.sequence[:30]}...\n")
+
+# Test the four fasta files listed below
+test_fasta_files = [
+    '/mnt/mydrive/analysis_pdb-dev/pdb_test1/runner_8gvb/8gvb.fasta',
+    '/mnt/mydrive/analysis_pdb-dev/pdb_test1/runner_8gom/8gom.fasta',
+    '/mnt/mydrive/analysis_pdb-dev/pdb_test1/runner_7n6e/7n6e.fasta',
+    '/mnt/mydrive/analysis_pdb-dev/test.fasta'
+]
+
+for file in test_fasta_files:
+    test_fasta_file(file)