add protein_structure (remove HETATM)

This commit is contained in:
2024-01-11 16:24:40 +08:00
parent 751f3280c9
commit 0495e71a23

View File

@@ -37,6 +37,7 @@ class PDBAnalyzer:
pdb_file: Path pdb_file: Path
pid: Optional[str] = field(default=None, init=False) pid: Optional[str] = field(default=None, init=False)
structure: object = field(init=False) structure: object = field(init=False)
protein_structure: object = field(init=False)
biodf: PandasPdb = field(init=False) biodf: PandasPdb = field(init=False)
protein_state: str = field(init=False) # Apo or Holo protein_state: str = field(init=False) # Apo or Holo
chain_id_list: List[str] = field(init=False) chain_id_list: List[str] = field(init=False)
@@ -45,9 +46,9 @@ class PDBAnalyzer:
""" """
Initialize the PDB structure after the object is created. Initialize the PDB structure after the object is created.
""" """
parser = PDBParser(QUIET=True)
self.pid = self.pdb_file.stem.lower() if len(self.pdb_file.stem) == 4 else None self.pid = self.pdb_file.stem.lower() if len(self.pdb_file.stem) == 4 else None
self.structure = parser.get_structure('PDB_structure', self.pdb_file.as_posix()) self.structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.pdb_file.as_posix())
self.protein_structure = PDBParser(QUIET=True).get_structure('PDB_structure', self.cleanATOM().as_posix())
self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix()) self.biodata = PandasPdb().read_pdb(self.pdb_file.as_posix())
self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo' self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list() self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
@@ -62,7 +63,7 @@ class PDBAnalyzer:
Defaults to <pdb_file>.clean.pdb. Defaults to <pdb_file>.clean.pdb.
ext (str): File extension to use for output file. Defaults to ".clean.pdb" ext (str): File extension to use for output file. Defaults to ".clean.pdb"
""" """
pdb_file = self.path.as_posix() pdb_file = self.pdb_file.as_posix()
# find all ATOM and TER lines # find all ATOM and TER lines
with open(pdb_file, "r") as fid: with open(pdb_file, "r") as fid:
good = [l for l in fid if l.startswith(("ATOM", "TER"))] good = [l for l in fid if l.startswith(("ATOM", "TER"))]
@@ -129,7 +130,8 @@ class PDBAnalyzer:
sequences = {} sequences = {}
# Process each chain in the structure # Process each chain in the structure
for model in self.structure: # use cleanATOM to remove HETATM
for model in self.protein_structure:
chains = model.get_list() chains = model.get_list()
for chain in chains: for chain in chains:
# Check continuity and get the sequence of residues # Check continuity and get the sequence of residues
@@ -155,7 +157,7 @@ class PDBAnalyzer:
return chain.get_id(), sorted(set(full_range) - set(observed)) return chain.get_id(), sorted(set(full_range) - set(observed))
return chain.get_id(), [] return chain.get_id(), []
chains = [chain for model in self.structure for chain in model] chains = [chain for model in self.protein_structure for chain in model]
missing_info = map(find_missing, chains) missing_info = map(find_missing, chains)
return dict(filter(lambda x: x[1], missing_info)) return dict(filter(lambda x: x[1], missing_info))