def extract_chains_to_new_pdb(self, chains: List[str]) -> PandasPdb:
    """
    Extract specified chains into a new PandasPdb object.

    Only the 'ATOM' records and, when present in ``self.biodata.df``, the
    'HETATM' records are carried over; no other record types are copied.
    The selected rows are copied explicitly, so modifying the returned
    object's frames does not touch (or warn about) ``self.biodata``.

    Args:
        chains (List[str]): List of chain IDs to be extracted. An empty
            list passes validation and yields empty record frames.

    Returns:
        PandasPdb: A new PandasPdb object containing only the specified chains.

    Raises:
        ValueError: If any of the specified chains are not found in the PDB file.
    """
    # Collect unknown chain IDs in a single pass; a non-empty result is
    # both the failure condition and the payload of the error message.
    missing_chains = [chain for chain in chains if chain not in self.chain_id_list]
    if missing_chains:
        raise ValueError(f"Chains {missing_chains} not found in the PDB file. {self.pdb_file.as_posix()}")

    source_frames = self.biodata.df

    # Create a new PandasPdb object for the specified chains
    new_ppdb = PandasPdb()

    # Extract ATOM records for the specified chains. The explicit copy
    # detaches the selection from the source frame so later edits on the
    # new object cannot raise pandas' SettingWithCopyWarning.
    atom_records = source_frames['ATOM']
    new_ppdb.df['ATOM'] = atom_records[atom_records['chain_id'].isin(chains)].copy()

    # Extract HETATM records for the specified chains, when the source has them
    if 'HETATM' in source_frames:
        hetatm_records = source_frames['HETATM']
        new_ppdb.df['HETATM'] = hetatm_records[hetatm_records['chain_id'].isin(chains)].copy()

    return new_ppdb