update extract_sequences

This commit is contained in:
2024-03-07 14:30:49 +08:00
parent ced22ac37b
commit eca3f45bd9

View File

@@ -102,15 +102,18 @@ class PDBAnalyzer:
biodf: PandasPdb = field(init=False)
protein_state: str = field(init=False) # Apo or Holo
chain_id_list: List[str] = field(init=False)
log_file: Path = field(init=False)
logger: logging.Logger = field(init=False)
def __post_init__(self):
"""
Finish setting up the analyzer after the dataclass fields are populated.

Derives the log file path from ``pdb_file`` and then runs, in this order:
logging configuration, structure parsing, property initialization, and
the sequence-numbering check.
"""
self.log_file = self.pdb_file.with_suffix('.log') # Log file path: the PDB path with its suffix replaced by .log
self._configure_logging()
self._initialize_structure()
self._initialize_properties()
self.check_and_log_sequence_issues() # Check for numbering problems and record them
def _initialize_structure(self):
self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix())
@@ -124,6 +127,7 @@ class PDBAnalyzer:
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
def _configure_logging(self):
"""配置日志系统。"""
self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
@@ -332,30 +336,38 @@ class PDBAnalyzer:
# Convert standard residue names to single-letter codes
return seq1(residue_name)
def extract_sequences(self, missing_char: str = '-', detailed: bool = False) -> Dict[str, Union[str, List[Tuple[int, str]]]]:
    """
    Extract the one-letter amino acid sequence of every chain in the structure.

    Only standard residues (hetero flag ``' '``) and selenomethionine
    (hetero flag ``'H_MSE'``) are considered; all other hetero residues and
    waters are skipped. Residues are emitted in the order they appear in the
    chain; breaks in residue numbering are NOT padded, so use
    ``detailed=True`` when the residue numbers are needed to locate gaps.

    Args:
        missing_char (str): Character emitted for a residue whose three-letter
            name has no one-letter equivalent (default is '-').
        detailed (bool): If True, each chain maps to a list of
            ``(residue_number, one_letter_code)`` tuples instead of a string.

    Returns:
        Dict[str, Union[str, List[Tuple[int, str]]]]: Chain ID mapped to its
        sequence string, or to the list of ``(residue_number, one_letter_code)``
        tuples when ``detailed`` is True. If the structure holds several
        models, a later model overwrites the entry of an earlier model that
        uses the same chain ID.
    """
    # seq1's built-in table has no entry for selenomethionine, so MSE is mapped
    # to M explicitly (it is reported as MET in SEQRES). 'Ter' is seq1's own
    # default custom mapping and is kept so that behavior is unchanged.
    three_to_one = {'Ter': '*', 'MSE': 'M'}
    accepted_flags = (' ', 'H_MSE')

    sequences = {}
    for model in self.structure:
        for chain in model:
            chain_seq = []
            for residue in chain:
                if residue.id[0] not in accepted_flags:
                    continue
                # seq1 never raises for unknown names; it substitutes undef_code.
                res_letter = seq1(
                    residue.resname,
                    custom_map=three_to_one,
                    undef_code=missing_char,
                )
                chain_seq.append((residue.id[1], res_letter) if detailed else res_letter)
            sequences[chain.id] = chain_seq if detailed else ''.join(chain_seq)
    return sequences
def extract_sequences_info(self) -> Dict[str, List[int]]: