From 751f3280c947a246787a01da0e6439f13e2d6aeb Mon Sep 17 00:00:00 2001
From: hotwa <pylyzeng@gmail.com>
Date: Thu, 11 Jan 2024 16:10:44 +0800
Subject: [PATCH] add cleanATOM

---
 analysis_pdb.py | 182 ++++++++++++++++++++++++++++++------------------
 1 file changed, 115 insertions(+), 67 deletions(-)

diff --git a/analysis_pdb.py b/analysis_pdb.py
index 9f5255a..ddcbeb7 100755
--- a/analysis_pdb.py
+++ b/analysis_pdb.py
@@ -9,7 +9,7 @@
 @version            :1.0
 '''
 # micromamba create -n modeller modeller biopython pymol-open-source biopandas requests -y -c conda-forge -c salilab
-# modeller注册码：MODELIRANJE
+# modeller注册码：MODELIRANJE (<conda_env>//lib/modeller-10.4/modlib/modeller/config.py)
 from dataclasses import dataclass, field
 from Bio.PDB import PDBParser
 from Bio.SeqUtils import seq1
@@ -23,6 +23,7 @@ from Bio import SeqIO
 import requests
 from copy import deepcopy
 from pymol import cmd
+import os
 
 @dataclass
 class PDBAnalyzer:
@@ -51,6 +52,31 @@ class PDBAnalyzer:
         self.protein_state = 'Holo' if 'HETATM' in self.biodata.df.keys() else 'Apo'
         self.chain_id_list = self.biodata.df['ATOM']['chain_id'].drop_duplicates().to_list()
 
+    def cleanATOM(self, out_file=None, ext="_clean.pdb") -> Path: # from pyrosetta.toolbox import cleanATOM
+        """Extract all ATOM and TER records in a PDB file and write them to a new file.
+
+        Args:
+            pdb_file (str): Path of the PDB file from which ATOM and TER records
+                will be extracted
+            out_file (str): Optional argument to specify a particular output filename.
+                Defaults to <pdb_file>.clean.pdb.
+            ext (str): File extension to use for output file. Defaults to ".clean.pdb"
+        """
+        pdb_file = self.path.as_posix()
+        # find all ATOM and TER lines
+        with open(pdb_file, "r") as fid:
+            good = [l for l in fid if l.startswith(("ATOM", "TER"))]
+
+        # default output file to <pdb_file>_clean.pdb
+        if out_file is None:
+            out_file = os.path.splitext(pdb_file)[0] + ext
+
+        # write the selected records to a new file
+        with open(out_file, "w") as fid:
+            fid.writelines(good)
+        return Path(out_file)
+
+
     def check_continuity(self, chain, missing_char):
         """ 
         Check the continuity of residues in a protein chain.
@@ -370,6 +396,92 @@ def import_and_merge_pdb_strings(pdb_strings, merged_object_name, output_file):
     # 保存合并后的对象
     cmd.save(output_file, merged_object_name)
 
+'''
+# 示例: 使用biopython提取A链（将会保留HETATM）
+chain_extractor = analyzer.extract_chain('A')  # 假设要提取的链ID是 'A'
+chain_extractor.save('biopython_extracted_chain_A.pdb')  # 保存为PDB文件
+# 示例: 使用biopandas提取A链（将不会保留HETATM）
+chain_extractor = analyzer.split_chain('A')  # 假设要提取的链ID是 'A'
+chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb')  # 保存为PDB文件
+# A链改B链, 并分割保存为单独文件
+analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
+# 分割所有的链
+split_dict = {}
+for j in analyzer.chain_id_list:
+    fn = Path(f'{PDB_ID}_{j}.pdb')
+    analyzer.split_chain(j).to_pdb(fn.as_posix())
+    split_dict[j]=fn.read_text()
+'''
+
+def fix_all(path:Path):
+    pdbfiles = [i for i in path.glob('*.pdb')]
+    for i in pdbfiles:
+        PDB_file_path = i
+        PDB_ID = i.stem
+        analyzer = PDBAnalyzer(PDB_file_path)
+        sequences = analyzer.extract_sequences(missing_char='-')  # 或者 'X', 或者 ''
+        print(f'Residues info for {PDB_ID}: \n',sequences)
+        missing_info = analyzer.extract_sequences_info()
+        print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
+        # 示例: 使用biopython提取A链（将会保留HETATM）
+        chain_extractor = analyzer.extract_chain('A')  # 假设要提取的链ID是 'A'
+        chain_extractor.save('biopython_extracted_chain_A.pdb')  # 保存为PDB文件
+        # 示例: 使用biopandas提取A链（将不会保留HETATM）
+        chain_extractor = analyzer.split_chain('A')  # 假设要提取的链ID是 'A'
+        chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb')  # 保存为PDB文件
+        # A链改B链, 并分割保存为单独文件
+        analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
+        # 分割所有的链
+        split_dict = {}
+        for j in analyzer.chain_id_list:
+            fn = Path(f'{PDB_ID}_{j}.pdb')
+            analyzer.split_chain(j).to_pdb(fn.as_posix())
+            split_dict[j]=fn.read_text()
+        # 修复loop区域
+        from build_modeller import PDBModeler
+        from modeller import ModellerError
+        mc_dict = {}
+        for mc in missing_info:
+            out_file = f'5sws_{mc}.pdb'
+            analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
+            mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
+            if len(mc_fasta) == 1:
+                mc_fasta = mc_fasta[0]
+                out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
+                analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
+                print(f'>{mc_fasta.description}')
+                print(mc_fasta.seq)
+                modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
+                try:
+                    modeller_results = modeller.make_model()
+                except ModellerError:
+                    print(f'Failed to build model for chain {mc}')
+                    print(f'No loops detected in {out_fasta_file.name}')
+                    print(f'may pdb file sequence is not correct')
+                    continue
+                except Exception as e:
+                    raise e
+                print(f'Model files: {[file.name for file in modeller_results]}')
+                # change id to original
+                for i in modeller_results:
+                    manalyzer = PDBAnalyzer(i)
+                    manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i)
+                if len(modeller_results) == 1:
+                    # use pymol to align
+                    aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb'))
+                    pdbstr = aligner.align()
+                    mc_dict[mc] = pdbstr
+                else:
+                    print('more than one model file, please set num_loop to 1')
+            elif len(mc_fasta) == 0:
+                continue
+            else:
+                raise ValueError(f'only can fix one chain content: {mc_fasta}')
+
+        # 使用示例
+        split_dict.update(mc_dict)  # 更新 split_dict
+        import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')
+
 if __name__ == "__main__":
     # import argparse
     # parser = argparse.ArgumentParser(description="Build model by Modeller")
@@ -380,71 +492,7 @@ if __name__ == "__main__":
     # parser.add_argument("-m", "--md_level", help="MD level")
     # parser.add_argument("-c", "--chain", help="Chain ID")
     # args = parser.parse_args()
-    pdbfiles = [i for i in Path('../PDBfile').glob('*.pdb')]
-    for i in pdbfiles:
-            PDB_file_path = i
-            PDB_ID = i.stem
-            analyzer = PDBAnalyzer(PDB_file_path)
-            sequences = analyzer.extract_sequences(missing_char='-')  # 或者 'X', 或者 ''
-            print(f'Residues info for {PDB_ID}: \n',sequences)
-            missing_info = analyzer.extract_sequences_info()
-            print(f'Missing residues info for {PDB_ID}:\n {missing_info}')
-            # 示例: 使用biopython提取A链（将会保留HETATM）
-            chain_extractor = analyzer.extract_chain('A')  # 假设要提取的链ID是 'A'
-            chain_extractor.save('biopython_extracted_chain_A.pdb')  # 保存为PDB文件
-            # 示例: 使用biopandas提取A链（将不会保留HETATM）
-            chain_extractor = analyzer.split_chain('A')  # 假设要提取的链ID是 'A'
-            chain_extractor.to_pdb('biopandas_extracted_chain_A.pdb')  # 保存为PDB文件
-            # A链改B链, 并分割保存为单独文件
-            analyzer.change_chain_identifier('A', 'B', split=True).to_pdb(f'{PDB_ID}_B.pdb')
-            # 分割所有的链
-            split_dict = {}
-            for j in analyzer.chain_id_list:
-                fn = Path(f'{PDB_ID}_{j}.pdb')
-                analyzer.split_chain(j).to_pdb(fn.as_posix())
-                split_dict[j]=fn.read_text()
-            # 修复loop区域
-            from build_modeller import PDBModeler
-            from modeller import ModellerError
-            mc_dict = {}
-            for mc in missing_info:
-                out_file = f'5sws_{mc}.pdb'
-                analyzer.split_chain(mc).to_pdb(out_file) # get misschain pdb file
-                mc_fasta = analyzer.filter_sequences(mc) # get misschain fasta file
-                if len(mc_fasta) == 1:
-                    mc_fasta = mc_fasta[0]
-                    out_fasta_file = Path(f'{PDB_ID}_{mc}.fasta')
-                    analyzer.write_seq_to_fasta_single_line(mc_fasta, out_fasta_file)
-                    print(f'>{mc_fasta.description}')
-                    print(mc_fasta.seq)
-                    modeller = PDBModeler(PDB_file_path, out_fasta_file, Path('.'), mc, 1, 'refine.very_fast')
-                    try:
-                        modeller_results = modeller.make_model()
-                    except ModellerError:
-                        print(f'Failed to build model for chain {mc}')
-                        print(f'No loops detected in {out_fasta_file.name}')
-                        print(f'may pdb file sequence is not correct')
-                        continue
-                    except Exception as e:
-                        raise e
-                    print(f'Model files: {[file.name for file in modeller_results]}')
-                    # change id to original
-                    for i in modeller_results:
-                        manalyzer = PDBAnalyzer(i)
-                        manalyzer.change_chain_identifier('A', mc, split=False).to_pdb(i)
-                    if len(modeller_results) == 1:
-                        # use pymol to align
-                        aligner = PDBAlign(PDB_file_path, modeller_results[0],Path(f'{PDB_ID}_merge_model.pdb'))
-                        pdbstr = aligner.align()
-                        mc_dict[mc] = pdbstr
-                    else:
-                        print('more than one model file, please set num_loop to 1')
-                elif len(mc_fasta) == 0:
-                    continue
-                else:
-                    raise ValueError(f'only can fix one chain content: {mc_fasta}')
+    # fix_all(Path('./pdb_test1'))
+    pass
 
-            # 使用示例
-            split_dict.update(mc_dict)  # 更新 split_dict
-            import_and_merge_pdb_strings(split_dict, "merged_object", f'{PDB_ID}.modellerfix.pdb')