diff --git a/.gitignore b/.gitignore
index f4ffd73..1f11fce 100755
--- a/.gitignore
+++ b/.gitignore
@@ -19,4 +19,4 @@ nohup.out
 pdb_test
 pdb_*
 .vscode
-data.tar.gz
\ No newline at end of file
+*.tar.gz
\ No newline at end of file
diff --git a/tcr_pmhc_complexes.py b/tcr_pmhc_complexes.py
index b085f59..4a484b9 100644
--- a/tcr_pmhc_complexes.py
+++ b/tcr_pmhc_complexes.py
@@ -19,27 +19,31 @@ AMINO_ACIDS = set(IUPACData.protein_letters)
 class BaseProteinSequence(BaseModel):
     sequence: str
 
     @field_validator('sequence')
-    def validate_amino_acids(cls, v: str, values, **kwargs):
-        if not set(v).issubset(AMINO_ACIDS):
+    @classmethod
+    def validate_amino_acids(cls, sequence: str) -> str:
+        """Reject sequences containing letters outside the IUPAC protein alphabet."""
+        if not set(sequence).issubset(AMINO_ACIDS):
             raise ValueError('Sequence contains invalid amino acids, not conforming to IUPAC standards')
-        return v
+        return sequence
 
 class ProteinSequence(BaseModel):
-    label_asym_id: str  # PDB 分配的链 ID
-    auth_asym_id: str  # 作者选择的链 ID
+    label_asym_id: str  # chain ID assigned by the PDB
+    auth_asym_id: str  # chain ID chosen by the depositing author
     sequence: str
-    is_id_consistent: bool = Field(default=False)  # ID 是否一致
+    is_id_consistent: bool = Field(default=False)  # whether the two chain IDs agree
 
     @property
     def is_id_consistent(self) -> bool:
-        # 检查两个 ID 是否一致
+        # The IDs are consistent when both naming schemes give the same value.
         return self.label_asym_id == self.auth_asym_id
 
     def display_chain_id(self) -> str:
-        """
-        显示链的 ID。如果 PDB ID 和作者 ID 不一致,则以 "label_asym_id [auth auth_asym_id]" 的格式显示。
-        """
+        """Return the chain ID for display.
+
+        When the PDB-assigned and author-assigned IDs differ, the result is
+        formatted as "label_asym_id [auth auth_asym_id]".
+        """
         if not self.is_id_consistent:
             return f"{self.label_asym_id} [auth {self.auth_asym_id}]"
         return self.label_asym_id
@@ -59,42 +63,62 @@ class ProteinComplex(BaseModel):
         self.extra_data.update(kwargs)
 
 class FastaHeaderInfo(BaseModel):
-    pdb_id: str
-    chain_ids: List[str]
-    auth_chain_ids: Dict[str, str]  # 映射 PDB 分配的链 ID 到作者分配的链 ID
-    description: str
+    pdb_id: Optional[str] = None
+    chain_ids: List[str] = []
+    auth_chain_ids: Dict[str, str] = {}  # maps PDB-assigned chain ID -> author-assigned chain ID
+    description: str = ''
+    is_polymeric: Optional[str] = None  # "Yes" / "No" / "Unknown", set by from_header_line
 
     @classmethod
     def from_header_line(cls, header_line: str):
-        parts = header_line.split('|')
-        pdb_id = parts[0].strip('>')
-        chain_info = parts[1] if len(parts) > 1 else ''
-        description = parts[2] if len(parts) > 2 else ''
+        """Build header info from a FASTA header line.
+
+        Headers containing '|' are parsed as "id|Chain(s) ...|description";
+        any other header is kept whole (minus its first character) as the description.
+        """
+        if '|' in header_line:
+            parts = header_line.split('|')
+            pdb_id = parts[0].strip('>')
+            chain_info = parts[1] if len(parts) > 1 else ''
+            description = parts[2] if len(parts) > 2 else ''
 
-        # 分割链信息以获取链 ID 和作者链 ID
-        chain_parts = chain_info.replace('Chain ', '').replace('Chains ', '').split(',')
-        chain_ids = []
-        auth_chain_ids = {}
+            chain_parts = chain_info.replace('Chain ', '').replace('Chains ', '').split(',')
+            chain_ids = []
+            auth_chain_ids = {}
 
-        for part in chain_parts:
-            part = part.strip()
-            if '[' in part:
-                # 分离作者链 ID
-                chain_id, auth_chain_id = part.split('[')
-                chain_id = chain_id.strip()
-                auth_chain_id = auth_chain_id.strip(']').replace('auth ', '').strip()  # 移除 'auth ' 但保留关键信息
-                chain_ids.append(chain_id)
-                auth_chain_ids[chain_id] = auth_chain_id
-            else:
-                chain_ids.append(part)
-                auth_chain_ids[part] = part  # 没有作者链 ID 时,使用相同的值
+            for part in chain_parts:
+                part = part.strip()
+                if '[' in part:
+                    chain_id, auth_chain_id = part.split('[')
+                    chain_id = chain_id.strip()
+                    auth_chain_id = auth_chain_id.strip(']').replace('auth ', '').strip()
+                    chain_ids.append(chain_id)
+                    auth_chain_ids[chain_id] = auth_chain_id
+                else:
+                    chain_ids.append(part)
+                    auth_chain_ids[part] = part
+
+            is_polymeric = "Yes" if len(chain_ids) > 1 else "No"
+        else:
+            pdb_id = None
+            chain_ids = []
+            auth_chain_ids = {}
+            description = header_line[1:].strip()  # drop the leading '>' character
+            is_polymeric = "Unknown"  # no chain info, so the polymer status cannot be determined
+
+        return cls(pdb_id=pdb_id, chain_ids=chain_ids, auth_chain_ids=auth_chain_ids, description=description, is_polymeric=is_polymeric)
 
-        return cls(pdb_id=pdb_id, chain_ids=chain_ids, auth_chain_ids=auth_chain_ids, description=description)
 
 class FastaSequence(BaseModel):
     header_info: FastaHeaderInfo
     sequence: BaseProteinSequence
 
+    @property
+    def sequence_length(self) -> int:
+        # self.sequence is a BaseProteinSequence instance; its own `sequence`
+        # field holds the raw residue string, hence sequence.sequence.
+        return len(self.sequence.sequence)
+
 class FastaFile(BaseModel):
     file: FilePath
     sequences: List[FastaSequence] = []
@@ -115,14 +139,19 @@ class FastaFile(BaseModel):
                 line = line.strip()
                 if line.startswith('>'):
                     if sequence:
+                        # Build the BaseProteinSequence instance first
+                        base_sequence = BaseProteinSequence(sequence=sequence)
                         header_info = FastaHeaderInfo.from_header_line(header_line)
-                        self.sequences.append(FastaSequence(header_info=header_info, sequence=BaseProteinSequence(sequence=sequence)))
+                        # Pass the BaseProteinSequence instance rather than the raw string
+                        self.sequences.append(FastaSequence(header_info=header_info, sequence=base_sequence))
                         sequence = ''
                     header_line = line
                 else:
                     sequence += line
 
+            # Make sure the last record in the file is added as well
             if sequence:
+                base_sequence = BaseProteinSequence(sequence=sequence)
                 header_info = FastaHeaderInfo.from_header_line(header_line)
-                self.sequences.append(FastaSequence(header_info=header_info, sequence=BaseProteinSequence(sequence=sequence)))
+                self.sequences.append(FastaSequence(header_info=header_info, sequence=base_sequence))
 
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..7a02120
--- /dev/null
+++ b/test.py
@@ -0,0 +1,47 @@
+from tcr_pmhc_complexes import FastaFile
+from pathlib import Path
+
+def test_fasta_file(file_path):
+    """Load a FASTA file and print each record's header fields, sequence prefix and length."""
+    fasta_file = FastaFile(file=Path(file_path))
+
+    print(f"文件:{file_path}")
+    print(f"序列数量:{fasta_file.sequence_num}\n")
+
+    for seq in fasta_file.sequences:
+        header_info = seq.header_info
+        print(f"PDB ID: {header_info.pdb_id}")
+        print(f"Chain IDs: {', '.join(header_info.chain_ids)}")
+        print(f"Author Chain IDs: {', '.join([f'{cid} ({aid})' for cid, aid in header_info.auth_chain_ids.items()])}")
+        print(f"Description: {header_info.description}")
+        print(f"Sequence: {seq.sequence.sequence[:30]}...")  # print only a prefix to keep the output short
+        print(f"Sequence Length: {seq.sequence_length}\n")  # also report the sequence length
+
+def test_fasta_file1(file_path):
+    """Like test_fasta_file, but print 'N/A' for missing header fields and show the polymer status."""
+    fasta_file = FastaFile(file=Path(file_path))
+
+    print(f"文件:{file_path}")
+    print(f"序列数量:{fasta_file.sequence_num}\n")
+
+    for seq in fasta_file.sequences:
+        header_info = seq.header_info
+        print(f"PDB ID: {header_info.pdb_id if header_info.pdb_id else 'N/A'}")
+        print(f"Chain IDs: {', '.join(header_info.chain_ids) if header_info.chain_ids else 'N/A'}")
+        print(f"Author Chain IDs: {', '.join([f'{cid} ({aid})' for cid, aid in header_info.auth_chain_ids.items()]) if header_info.auth_chain_ids else 'N/A'}")
+        print(f"Is Polymeric: {header_info.is_polymeric}")
+        print(f"Description: {header_info.description}")
+        print(f"Sequence: {seq.sequence.sequence[:30]}...\n")
+
+# FASTA files to exercise (machine-specific absolute paths)
+test_fasta_files = [
+    '/mnt/mydrive/analysis_pdb-dev/pdb_test1/runner_8gvb/8gvb.fasta',
+    '/mnt/mydrive/analysis_pdb-dev/pdb_test1/runner_8gom/8gom.fasta',
+    '/mnt/mydrive/analysis_pdb-dev/pdb_test1/runner_7n6e/7n6e.fasta',
+    '/mnt/mydrive/analysis_pdb-dev/test.fasta'
+]
+
+if __name__ == "__main__":
+    # Run only when executed as a script, so importing this module has no side effects.
+    for file in test_fasta_files:
+        test_fasta_file(file)