update extract_sequences
This commit is contained in:
@@ -102,15 +102,18 @@ class PDBAnalyzer:
|
|||||||
biodf: PandasPdb = field(init=False)
|
biodf: PandasPdb = field(init=False)
|
||||||
protein_state: str = field(init=False) # Apo or Holo
|
protein_state: str = field(init=False) # Apo or Holo
|
||||||
chain_id_list: List[str] = field(init=False)
|
chain_id_list: List[str] = field(init=False)
|
||||||
|
log_file: Path = field(init=False)
|
||||||
logger: logging.Logger = field(init=False)
|
logger: logging.Logger = field(init=False)
|
||||||
|
|
||||||
def __post_init__(self):
|
def __post_init__(self):
|
||||||
"""
|
"""
|
||||||
Initialize the PDB structure after the object is created.
|
Initialize the PDB structure after the object is created.
|
||||||
"""
|
"""
|
||||||
|
self.log_file = self.pdb_file.with_suffix('.log') # 设置日志文件路径
|
||||||
self._configure_logging()
|
self._configure_logging()
|
||||||
self._initialize_structure()
|
self._initialize_structure()
|
||||||
self._initialize_properties()
|
self._initialize_properties()
|
||||||
|
self.check_and_log_sequence_issues() # 检查并记录编号问题
|
||||||
|
|
||||||
def _initialize_structure(self):
|
def _initialize_structure(self):
|
||||||
self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix())
|
self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix())
|
||||||
@@ -124,6 +127,7 @@ class PDBAnalyzer:
|
|||||||
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
|
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
|
||||||
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
|
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
|
||||||
|
|
||||||
|
|
||||||
def _configure_logging(self):
|
def _configure_logging(self):
|
||||||
"""配置日志系统。"""
|
"""配置日志系统。"""
|
||||||
self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
|
self.logger = logging.getLogger(f"PDBAnalyzer_{self.pdb_file.stem}")
|
||||||
@@ -332,30 +336,38 @@ class PDBAnalyzer:
|
|||||||
# Convert standard residue names to single-letter codes
|
# Convert standard residue names to single-letter codes
|
||||||
return seq1(residue_name)
|
return seq1(residue_name)
|
||||||
|
|
||||||
def extract_sequences(self, missing_char='-') -> Dict[str, str]:
|
def extract_sequences(self, missing_char: str = '-', detailed: bool = False) -> Dict[str, Union[str, List[Tuple[int, str]]]]:
|
||||||
"""
|
"""
|
||||||
Extract amino acid sequences from the structure, with gaps denoted by the specified character.
|
Extract amino acid sequences from the structure, one per chain; residues whose names cannot be converted to a one-letter code are written as the specified character.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
missing_char (str): Character to use for missing residues (default is '-').
|
missing_char (str): Character written for residues whose names cannot be converted to a one-letter code (default is '-').
|
||||||
|
detailed (bool): If True, each chain ID maps to a list of (residue_number, one_letter_code) tuples instead of a plain sequence string (default is False).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dict[str, str]: Dictionary of chain IDs mapped to their amino acid sequences.
|
Dict[str, Union[str, List[Tuple[int, str]]]]: Dictionary of chain IDs mapped to their amino acid sequences
|
||||||
|
or detailed sequence info depending on the detailed flag.
|
||||||
"""
|
"""
|
||||||
# Create a continuity check function with the specified missing character
|
|
||||||
check_continuity_new = partial(self.check_continuity, missing_char=missing_char)
|
|
||||||
|
|
||||||
sequences = {}
|
sequences = {}
|
||||||
# Process each chain in the structure
|
for model in self.structure:
|
||||||
for model in self.protein_structure:
|
for chain in model:
|
||||||
chains = model.get_list()
|
chain_seq = []
|
||||||
for chain in chains:
|
for residue in chain:
|
||||||
# Check continuity and get the sequence of residues
|
if residue.id[0] == ' ' or residue.id[0] == 'H_MSE': # MSE is treated as MET in SEQRES
|
||||||
chain_sequence = check_continuity_new(chain)
|
res_id = residue.id[1]
|
||||||
# Convert to single-letter sequence
|
try:
|
||||||
single_letter_sequence = ''.join(self.residue_to_single_letter(res[0]) for res in chain_sequence)
|
# Convert three-letter code to one-letter code; unrecognized residue names become missing_char
|
||||||
sequences[chain.id] = single_letter_sequence
|
res_letter = seq1(residue.resname, undef_code=missing_char)
|
||||||
|
except KeyError:
|
||||||
|
res_letter = missing_char
|
||||||
|
if detailed:
|
||||||
|
chain_seq.append((res_id, res_letter))
|
||||||
|
else:
|
||||||
|
chain_seq.append(res_letter)
|
||||||
|
if detailed:
|
||||||
|
sequences[chain.id] = chain_seq
|
||||||
|
else:
|
||||||
|
sequences[chain.id] = ''.join(chain_seq)
|
||||||
return sequences
|
return sequences
|
||||||
|
|
||||||
def extract_sequences_info(self) -> Dict[str, List[int]]:
|
def extract_sequences_info(self) -> Dict[str, List[int]]:
|
||||||
|
|||||||
Reference in New Issue
Block a user