update extract_sequences
This commit is contained in:
@@ -102,15 +102,18 @@ class PDBAnalyzer:
|
||||
biodf: PandasPdb = field(init=False)
|
||||
protein_state: str = field(init=False) # Apo or Holo
|
||||
chain_id_list: List[str] = field(init=False)
|
||||
log_file: Path = field(init=False)
|
||||
logger: logging.Logger = field(init=False)
|
||||
|
||||
def __post_init__(self):
    """
    Finish setting up the analyzer after the dataclass fields are assigned.

    The log file path is derived from the PDB file path by replacing its
    suffix with '.log'. The remaining setup steps then run in a fixed order:
    configure logging, initialize the structure, initialize the derived
    properties, and finally check and log sequence issues.
    """
    # Log file path: same location and stem as the PDB file, '.log' suffix.
    self.log_file = self.pdb_file.with_suffix('.log')

    # Order matters: each step runs only after the previous one has finished.
    setup_steps = (
        self._configure_logging,
        self._initialize_structure,
        self._initialize_properties,
        self.check_and_log_sequence_issues,  # check and record numbering issues
    )
    for run_step in setup_steps:
        run_step()
|
||||
|
||||
def _initialize_structure(self):
|
||||
self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix())
|
||||
@@ -124,6 +127,7 @@ class PDBAnalyzer:
|
||||
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
|
||||
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
|
||||
|
||||
|
||||
def _configure_logging(self):
|
||||
"""配置日志系统。"""
|
||||
self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
|
||||
@@ -332,30 +336,38 @@ class PDBAnalyzer:
|
||||
# Convert standard residue names to single-letter codes
|
||||
return seq1(residue_name)
|
||||
|
||||
def extract_sequences(self, missing_char: str = '-', detailed: bool = False) -> Dict[str, Union[str, List[Tuple[int, str]]]]:
    """
    Extract the one-letter amino acid sequence of every chain in the structure.

    Only residues with a blank hetero flag and selenomethionine residues
    (hetero flag 'H_MSE') are included; every other residue is skipped.
    Residues are reported in the order the chain yields them. No placeholder
    is inserted for breaks in the residue numbering.

    Args:
        missing_char (str): Character emitted for a residue whose three-letter
            name has no one-letter code (default is '-').
        detailed (bool): If True, each chain maps to a list of
            (residue_number, one_letter_code) tuples instead of a plain string.

    Returns:
        Dict[str, Union[str, List[Tuple[int, str]]]]: Chain IDs mapped to their
        sequence string, or to a list of (residue_number, one_letter_code)
        tuples when ``detailed`` is True. If several models contain the same
        chain ID, the entry from the last model wins.
    """
    # seq1 has no built-in code for selenomethionine, so without this mapping
    # the MSE residues accepted below would be reported as missing_char rather
    # than 'M'. 'Ter' is repeated because passing custom_map replaces seq1's
    # default map.
    custom_map = {'Ter': '*', 'MSE': 'M'}
    accepted_hetero_flags = (' ', 'H_MSE')

    sequences = {}
    for model in self.structure:
        for chain in model:
            chain_seq = []
            for residue in chain:
                if residue.id[0] not in accepted_hetero_flags:
                    continue
                # seq1 substitutes undef_code for unknown names itself, so no
                # exception handling is needed here.
                res_letter = seq1(residue.resname, custom_map=custom_map, undef_code=missing_char)
                chain_seq.append((residue.id[1], res_letter) if detailed else res_letter)
            sequences[chain.id] = chain_seq if detailed else ''.join(chain_seq)
    return sequences
|
||||
|
||||
def extract_sequences_info(self) -> Dict[str, List[int]]:
|
||||
|
||||
Reference in New Issue
Block a user