diff --git a/analysis_pdb.py b/analysis_pdb.py index 74e7599..382dbe6 100755 --- a/analysis_pdb.py +++ b/analysis_pdb.py @@ -102,15 +102,18 @@ class PDBAnalyzer: biodf: PandasPdb = field(init=False) protein_state: str = field(init=False) # Apo or Holo chain_id_list: List[str] = field(init=False) + log_file: Path = field(init=False) logger: logging.Logger = field(init=False) def __post_init__(self): """ Initialize the PDB structure after the object is created. """ + self.log_file = self.pdb_file.with_suffix('.log') # 设置日志文件路径 self._configure_logging() self._initialize_structure() self._initialize_properties() + self.check_and_log_sequence_issues() # 检查并记录编号问题 def _initialize_structure(self): self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix()) @@ -124,6 +127,7 @@ class PDBAnalyzer: self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo' self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list() + def _configure_logging(self): """配置日志系统。""" self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}") @@ -332,30 +336,38 @@ class PDBAnalyzer: # Convert standard residue names to single-letter codes return seq1(residue_name) - def extract_sequences(self, missing_char='-') -> Dict[str, str]: - """ + def extract_sequences(self, missing_char: str = '-', detailed: bool = False) -> Dict[str, Union[str, List[Tuple[int, str]]]]: + """ Extract amino acid sequences from the structure, with gaps denoted by the specified character. Args: - missing_char (str): Character to use for missing residues (default is '-'). + missing_char (str): Character to use for missing residues (default is '-'). + detailed (bool): If True, returns detailed sequence info as {"chain_ID": ("sequence_number", "amino_single_char")}. Returns: - Dict[str, str]: Dictionary of chain IDs mapped to their amino acid sequences. + Dict[str, Union[str, List[Tuple[int, str]]]]: Dictionary of chain IDs mapped to their amino acid sequences + or detailed sequence info depending on the detailed flag. """ - # Create a continuity check function with the specified missing character - check_continuity_new = partial(self.check_continuity, missing_char=missing_char) - sequences = {} - # Process each chain in the structure - for model in self.protein_structure: - chains = model.get_list() - for chain in chains: - # Check continuity and get the sequence of residues - chain_sequence = check_continuity_new(chain) - # Convert to single-letter sequence - single_letter_sequence = ''.join(self.residue_to_single_letter(res[0]) for res in chain_sequence) - sequences[chain.id] = single_letter_sequence - + for model in self.structure: + for chain in model: + chain_seq = [] + for residue in chain: + if residue.id[0] == ' ' or residue.id[0] == 'H_MSE': # MSE is treated as MET in SEQRES + res_id = residue.id[1] + try: + # Convert three-letter code to one-letter code, "-" for gaps + res_letter = seq1(residue.resname, undef_code=missing_char) + except KeyError: + res_letter = missing_char + if detailed: + chain_seq.append((res_id, res_letter)) + else: + chain_seq.append(res_letter) + if detailed: + sequences[chain.id] = chain_seq + else: + sequences[chain.id] = ''.join(chain_seq) return sequences def extract_sequences_info(self) -> Dict[str, List[int]]: