update
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -19,4 +19,4 @@ nohup.out
|
|||||||
pdb_test
|
pdb_test
|
||||||
pdb_*
|
pdb_*
|
||||||
.vscode
|
.vscode
|
||||||
data.tar.gz
|
*.tar.gz
|
||||||
@@ -19,27 +19,23 @@ AMINO_ACIDS = set(IUPACData.protein_letters)
|
|||||||
class BaseProteinSequence(BaseModel):
    """A protein sequence whose letters must all belong to ``AMINO_ACIDS``."""

    sequence: str

    # `field_validator` has to wrap the classmethod: a bare `@classmethod`
    # is never registered with pydantic, so the check would silently be
    # skipped when the model is constructed.
    @field_validator('sequence')
    @classmethod
    def validate_amino_acids(cls, sequence: str) -> str:
        """Check that every character of ``sequence`` is in ``AMINO_ACIDS``.

        Args:
            sequence: The raw sequence string supplied to the model.

        Returns:
            The unchanged sequence when it is valid.

        Raises:
            ValueError: If any character is not in ``AMINO_ACIDS``.
        """
        if not set(sequence).issubset(AMINO_ACIDS):
            raise ValueError('Sequence contains invalid amino acids, not conforming to IUPAC standards')
        return sequence
|
||||||
|
|
||||||
class ProteinSequence(BaseModel):
    """One chain of a structure, carrying both of its chain identifiers."""

    label_asym_id: str  # chain ID assigned by the PDB
    auth_asym_id: str  # chain ID chosen by the author
    sequence: str

    # NOTE: `is_id_consistent` is intentionally NOT declared as a stored
    # field. Declaring a field and a property with the same name in one class
    # body makes the later binding (the property) overwrite the earlier one,
    # so the field's default would become the property object, not a bool.
    # The value is fully derived from the two IDs, so a property suffices.
    @property
    def is_id_consistent(self) -> bool:
        """Return True when the PDB-assigned and author-chosen IDs are equal."""
        return self.label_asym_id == self.auth_asym_id

    def display_chain_id(self) -> str:
        """Return the chain ID formatted for display.

        When the two IDs differ the result has the form
        ``"label_asym_id [auth auth_asym_id]"``; otherwise it is just
        ``label_asym_id``.
        """
        if not self.is_id_consistent:
            return f"{self.label_asym_id} [auth {self.auth_asym_id}]"
        return self.label_asym_id
|
||||||
@@ -59,42 +55,57 @@ class ProteinComplex(BaseModel):
|
|||||||
self.extra_data.update(kwargs)
|
self.extra_data.update(kwargs)
|
||||||
|
|
||||||
class FastaHeaderInfo(BaseModel):
    """Metadata parsed from a single FASTA header line."""

    pdb_id: Optional[str] = None
    chain_ids: List[str] = []
    # Maps each PDB-assigned chain ID to the author-assigned chain ID.
    auth_chain_ids: Dict[str, str] = {}
    description: str = ''
    # "Yes", "No" or "Unknown" when built by `from_header_line`.
    is_polymeric: Optional[str] = None

    @classmethod
    def from_header_line(cls, header_line: str):
        """Build a ``FastaHeaderInfo`` from one FASTA header line.

        A header containing ``|`` is read as ``id|chains|description``. The
        chain field is a comma-separated list; an entry such as
        ``A[auth B]`` yields chain ID ``A`` with author chain ID ``B``, and
        an entry without brackets uses the same value for both.

        A header without ``|`` carries no chain information: the whole line
        (minus a leading ``>``) becomes the description and
        ``is_polymeric`` is ``"Unknown"``.

        Args:
            header_line: The header line, normally starting with ``>``.

        Returns:
            A populated ``FastaHeaderInfo`` instance.
        """
        if '|' not in header_line:
            # Only drop the first character when it really is the '>' marker.
            text = header_line[1:] if header_line.startswith('>') else header_line
            return cls(
                pdb_id=None,
                chain_ids=[],
                auth_chain_ids={},
                description=text.strip(),
                is_polymeric="Unknown",
            )

        parts = header_line.split('|')
        pdb_id = parts[0].strip('>')
        chain_info = parts[1]
        description = parts[2] if len(parts) > 2 else ''

        chain_ids = []
        auth_chain_ids = {}
        chain_tokens = chain_info.replace('Chain ', '').replace('Chains ', '').split(',')
        for token in chain_tokens:
            token = token.strip()
            if not token:
                # An empty chain field must not produce a bogus '' chain ID.
                continue
            # partition() never raises, unlike unpacking split('[') when the
            # token happens to contain more than one '['.
            chain_id, bracket, auth_part = token.partition('[')
            chain_id = chain_id.strip()
            if bracket:
                # Drop the closing bracket and the 'auth ' marker, keep the ID.
                auth_chain_id = auth_part.strip(']').replace('auth ', '').strip()
            else:
                # No author chain ID given: reuse the PDB-assigned one.
                auth_chain_id = chain_id
            chain_ids.append(chain_id)
            auth_chain_ids[chain_id] = auth_chain_id

        if not chain_ids:
            # Same convention as the no-'|' case: no chains means unknown.
            is_polymeric = "Unknown"
        else:
            is_polymeric = "Yes" if len(chain_ids) > 1 else "No"

        return cls(
            pdb_id=pdb_id,
            chain_ids=chain_ids,
            auth_chain_ids=auth_chain_ids,
            description=description,
            is_polymeric=is_polymeric,
        )
|
|
||||||
|
|
||||||
class FastaSequence(BaseModel):
    """A FASTA record: the parsed header plus its validated sequence."""

    header_info: FastaHeaderInfo
    sequence: BaseProteinSequence

    @property
    def sequence_length(self) -> int:
        """Return the number of characters in the record's sequence string."""
        # `self.sequence` is a BaseProteinSequence instance; the raw string
        # lives in its own `sequence` field, hence the double attribute hop.
        residues = self.sequence.sequence
        return len(residues)
|
||||||
|
|
||||||
class FastaFile(BaseModel):
|
class FastaFile(BaseModel):
|
||||||
file: FilePath
|
file: FilePath
|
||||||
sequences: List[FastaSequence] = []
|
sequences: List[FastaSequence] = []
|
||||||
@@ -115,14 +126,19 @@ class FastaFile(BaseModel):
|
|||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line.startswith('>'):
|
if line.startswith('>'):
|
||||||
if sequence:
|
if sequence:
|
||||||
|
# 创建 BaseProteinSequence 实例
|
||||||
|
base_sequence = BaseProteinSequence(sequence=sequence)
|
||||||
header_info = FastaHeaderInfo.from_header_line(header_line)
|
header_info = FastaHeaderInfo.from_header_line(header_line)
|
||||||
self.sequences.append(FastaSequence(header_info=header_info, sequence=BaseProteinSequence(sequence=sequence)))
|
# 使用 BaseProteinSequence 实例而不是字符串
|
||||||
|
self.sequences.append(FastaSequence(header_info=header_info, sequence=base_sequence))
|
||||||
sequence = ''
|
sequence = ''
|
||||||
header_line = line
|
header_line = line
|
||||||
else:
|
else:
|
||||||
sequence += line
|
sequence += line
|
||||||
|
|
||||||
|
# 确保文件末尾的序列也被添加
|
||||||
if sequence:
|
if sequence:
|
||||||
|
base_sequence = BaseProteinSequence(sequence=sequence)
|
||||||
header_info = FastaHeaderInfo.from_header_line(header_line)
|
header_info = FastaHeaderInfo.from_header_line(header_line)
|
||||||
self.sequences.append(FastaSequence(header_info=header_info, sequence=BaseProteinSequence(sequence=sequence)))
|
self.sequences.append(FastaSequence(header_info=header_info, sequence=base_sequence))
|
||||||
|
|
||||||
|
|||||||
43
test.py
Normal file
43
test.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
from tcr_pmhc_complexes import FastaFile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def test_fasta_file(file_path):
    """Load ``file_path`` as a ``FastaFile`` and print a summary of each record.

    Prints the file path and sequence count, then for every sequence its
    PDB ID, chain IDs, author chain IDs, description, the first 30 sequence
    characters and the sequence length.
    """
    parsed = FastaFile(file=Path(file_path))

    print(f"文件:{file_path}")
    print(f"序列数量:{parsed.sequence_num}\n")

    for record in parsed.sequences:
        header = record.header_info
        chain_list = ', '.join(header.chain_ids)
        author_pairs = ', '.join(
            [f'{cid} ({aid})' for cid, aid in header.auth_chain_ids.items()]
        )
        # Only a prefix of the sequence is shown to keep the output short.
        preview = record.sequence.sequence[:30]

        print(f"PDB ID: {header.pdb_id}")
        print(f"Chain IDs: {chain_list}")
        print(f"Author Chain IDs: {author_pairs}")
        print(f"Description: {header.description}")
        print(f"Sequence: {preview}...")
        print(f"Sequence Length: {record.sequence_length}\n")
|
||||||
|
|
||||||
|
def test_fasta_file1(file_path):
    """Load ``file_path`` as a ``FastaFile`` and print each record, using 'N/A' for gaps.

    Same idea as ``test_fasta_file`` but falls back to ``'N/A'`` when the
    PDB ID, chain IDs or author chain IDs are empty, and also prints the
    ``is_polymeric`` value.
    """
    parsed = FastaFile(file=Path(file_path))

    print(f"文件:{file_path}")
    print(f"序列数量:{parsed.sequence_num}\n")

    for record in parsed.sequences:
        header = record.header_info

        pdb_label = header.pdb_id or 'N/A'
        if header.chain_ids:
            chain_label = ', '.join(header.chain_ids)
        else:
            chain_label = 'N/A'
        if header.auth_chain_ids:
            author_label = ', '.join(
                [f'{cid} ({aid})' for cid, aid in header.auth_chain_ids.items()]
            )
        else:
            author_label = 'N/A'
        preview = record.sequence.sequence[:30]

        print(f"PDB ID: {pdb_label}")
        print(f"Chain IDs: {chain_label}")
        print(f"Author Chain IDs: {author_label}")
        print(f"Is Polymeric: {header.is_polymeric}")
        print(f"Description: {header.description}")
        print(f"Sequence: {preview}...\n")
|
||||||
|
|
||||||
|
# Sample FASTA files to run through the printer (four paths on the
# developer's machine).
test_fasta_files = [
    '/mnt/mydrive/analysis_pdb-dev/pdb_test1/runner_8gvb/8gvb.fasta',
    '/mnt/mydrive/analysis_pdb-dev/pdb_test1/runner_8gom/8gom.fasta',
    '/mnt/mydrive/analysis_pdb-dev/pdb_test1/runner_7n6e/7n6e.fasta',
    '/mnt/mydrive/analysis_pdb-dev/test.fasta',
]

# Run only when executed as a script, so that merely importing this module
# (for example during test collection) does not try to open these
# machine-specific paths.
if __name__ == "__main__":
    for fasta_path in test_fasta_files:
        test_fasta_file(fasta_path)
|
||||||
Reference in New Issue
Block a user