update extract_sequences

2024-03-07 14:30:49 +08:00
parent ced22ac37b
commit eca3f45bd9
1 changed file with 29 additions and 17 deletions
--- a/analysis_pdb.py
+++ b/analysis_pdb.py
@@ -102,15 +102,18 @@ class PDBAnalyzer:
    biodf: PandasPdb = field(init=False)
    protein_state: str = field(init=False) # Apo or Holo
    chain_id_list: List[str] = field(init=False)
+    log_file: Path = field(init=False)
    logger: logging.Logger = field(init=False)

    def __post_init__(self):
        """ 
        Initialize the PDB structure after the object is created.
        """
+        self.log_file = self.pdb_file.with_suffix('.log') # 设置日志文件路径
        self._configure_logging()
        self._initialize_structure()
        self._initialize_properties()
+        self.check_and_log_sequence_issues() # 检查并记录编号问题
        
    def _initialize_structure(self):
        self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix())
@@ -124,6 +127,7 @@ class PDBAnalyzer:
        self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
        self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
        
+        
    def _configure_logging(self):
        """配置日志系统。"""
        self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
@@ -332,30 +336,38 @@ class PDBAnalyzer:
        # Convert standard residue names to single-letter codes
        return seq1(residue_name)

-    def extract_sequences(self, missing_char='-') -> Dict[str, str]:
-        """ 
+    def extract_sequences(self, missing_char: str = '-', detailed: bool = False) -> Dict[str, Union[str, List[Tuple[int, str]]]]:
+        """
        Extract amino acid sequences from the structure, with gaps denoted by the specified character.

        Args:
-        missing_char (str): Character to use for missing residues (default is '-').
+            missing_char (str): Character passed to seq1 as undef_code and used as the fallback if conversion raises KeyError (default is '-').
+            detailed (bool): If True, each chain ID maps to a list of (residue_number, one_letter_code) tuples instead of a plain sequence string.

        Returns:
-        Dict[str, str]: Dictionary of chain IDs mapped to their amino acid sequences.
+            Dict[str, Union[str, List[Tuple[int, str]]]]: Dictionary of chain IDs mapped to their amino acid sequences
+            or detailed sequence info depending on the detailed flag.
        """
-        # Create a continuity check function with the specified missing character
-        check_continuity_new = partial(self.check_continuity, missing_char=missing_char)
-
        sequences = {}
-        # Process each chain in the structure
-        for model in self.protein_structure:
-            chains = model.get_list()
-            for chain in chains:
-                # Check continuity and get the sequence of residues
-                chain_sequence = check_continuity_new(chain)
-                # Convert to single-letter sequence
-                single_letter_sequence = ''.join(self.residue_to_single_letter(res[0]) for res in chain_sequence)
-                sequences[chain.id] = single_letter_sequence
-
+        for model in self.structure:
+            for chain in model:
+                chain_seq = []
+                for residue in chain:
+                    if residue.id[0] == ' ' or residue.id[0] == 'H_MSE':  # MSE is treated as MET in SEQRES
+                        res_id = residue.id[1]
+                        try:
+                            # Convert three-letter code to one-letter code; missing_char is passed as undef_code
+                            res_letter = seq1(residue.resname, undef_code=missing_char)
+                        except KeyError:
+                            res_letter = missing_char
+                        if detailed:
+                            chain_seq.append((res_id, res_letter))
+                        else:
+                            chain_seq.append(res_letter)
+                if detailed:
+                    sequences[chain.id] = chain_seq
+                else:
+                    sequences[chain.id] = ''.join(chain_seq)
        return sequences
    
    def extract_sequences_info(self) -> Dict[str, List[int]]: