update extract_sequences

This commit is contained in:
2024-03-07 14:30:49 +08:00
parent ced22ac37b
commit eca3f45bd9

View File

@@ -102,15 +102,18 @@ class PDBAnalyzer:
biodf: PandasPdb = field(init=False) biodf: PandasPdb = field(init=False)
protein_state: str = field(init=False) # Apo or Holo protein_state: str = field(init=False) # Apo or Holo
chain_id_list: List[str] = field(init=False) chain_id_list: List[str] = field(init=False)
log_file: Path = field(init=False)
logger: logging.Logger = field(init=False) logger: logging.Logger = field(init=False)
def __post_init__(self): def __post_init__(self):
""" """
Initialize the PDB structure after the object is created. Initialize the PDB structure after the object is created.
""" """
self.log_file = self.pdb_file.with_suffix('.log') # 设置日志文件路径
self._configure_logging() self._configure_logging()
self._initialize_structure() self._initialize_structure()
self._initialize_properties() self._initialize_properties()
self.check_and_log_sequence_issues() # 检查并记录编号问题
def _initialize_structure(self): def _initialize_structure(self):
self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix()) self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix())
@@ -124,6 +127,7 @@ class PDBAnalyzer:
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo' self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list() self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
def _configure_logging(self): def _configure_logging(self):
"""配置日志系统。""" """配置日志系统。"""
self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}") self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
@@ -332,30 +336,38 @@ class PDBAnalyzer:
# Convert standard residue names to single-letter codes # Convert standard residue names to single-letter codes
return seq1(residue_name) return seq1(residue_name)
def extract_sequences(self, missing_char='-') -> Dict[str, str]: def extract_sequences(self, missing_char: str = '-', detailed: bool = False) -> Dict[str, Union[str, List[Tuple[int, str]]]]:
""" """
Extract amino acid sequences from the structure, with gaps denoted by the specified character. Extract amino acid sequences from the structure, with gaps denoted by the specified character.
Args: Args:
missing_char (str): Character to use for missing residues (default is '-'). missing_char (str): Character to use for missing residues (default is '-').
detailed (bool): If True, each chain ID maps to a list of (residue_number, one_letter_code) tuples instead of a joined sequence string.
Returns: Returns:
Dict[str, str]: Dictionary of chain IDs mapped to their amino acid sequences. Dict[str, Union[str, List[Tuple[int, str]]]]: Dictionary of chain IDs mapped to their amino acid sequences
or detailed sequence info depending on the detailed flag.
""" """
# Create a continuity check function with the specified missing character
check_continuity_new = partial(self.check_continuity, missing_char=missing_char)
sequences = {} sequences = {}
# Process each chain in the structure for model in self.structure:
for model in self.protein_structure: for chain in model:
chains = model.get_list() chain_seq = []
for chain in chains: for residue in chain:
# Check continuity and get the sequence of residues if residue.id[0] == ' ' or residue.id[0] == 'H_MSE': # MSE is treated as MET in SEQRES
chain_sequence = check_continuity_new(chain) res_id = residue.id[1]
# Convert to single-letter sequence try:
single_letter_sequence = ''.join(self.residue_to_single_letter(res[0]) for res in chain_sequence) # Convert three-letter code to one-letter code; unrecognised residue names become missing_char
sequences[chain.id] = single_letter_sequence res_letter = seq1(residue.resname, undef_code=missing_char)
except KeyError:
res_letter = missing_char
if detailed:
chain_seq.append((res_id, res_letter))
else:
chain_seq.append(res_letter)
if detailed:
sequences[chain.id] = chain_seq
else:
sequences[chain.id] = ''.join(chain_seq)
return sequences return sequences
def extract_sequences_info(self) -> Dict[str, List[int]]: def extract_sequences_info(self) -> Dict[str, List[int]]: