analysis_pdb/runner.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file               :runner.py
@Description:       :
@Date               :2023/12/04 14:34:36
@Author             :lyzeng
@Email              :pylyzeng@gmail.com
@version            :1.0
'''

import time
import logging
from dataclasses import dataclass, field
from pathlib import Path
import subprocess
import multiprocessing
import shutil
import os

# 设置日志记录
logging.basicConfig(level=logging.INFO, filename='simulation_log.log', filemode='a',
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

@dataclass
class SimulationRunner:
    pdb_file: Path
    nsteps: int
    dt: float
    base_folder: Path
    bash_script: Path = None  # Bash脚本路径作为可选参数
    gmxrc_path: Path = None  # GMXRC文件路径作为可选参数
    gpu_id: int = None
    temp_folder: Path = field(init=False)  # 用于存储临时数据和结果数据的路径(轨迹数据处理中间数据等)
    runner_folder: Path = field(init=False)
    tpr_file: Path = field(init=False)
    xtc_file: Path = field(init=False)

    def __post_init__(self):
        # 初始化文件夹和文件路径
        self.runner_folder = self.base_folder / f"runner_{self.pdb_file.stem}"
        self.tpr_file = self.runner_folder / "md.tpr"
        self.xtc_file = self.runner_folder / "md.xtc"
        self.runner_folder.mkdir(exist_ok=True)
        self.temp_folder = self.runner_folder / "temp"
        self.temp_folder.mkdir(exist_ok=True)
        self.bash_script = self.bash_script.absolute() if self.bash_script else Path(__file__).resolve().parent / "md_gromacs.sh"
        # 设置 GMXRC_PATH 环境变量
        self.gmxrc_path = self.gmxrc_path

    def copy_pdb(self):
        shutil.copy(self.pdb_file, self.runner_folder / self.pdb_file.name)

    def set_gpu(self, gpu_id):
        """设置要使用的GPU。"""
        self.gpu_id = gpu_id

    @staticmethod
    def read_ndx_file(filename):
        ndx_dict = {}
        current_section = None

        with open(filename, 'r') as file:
            for line in file:
                line = line.strip()

                if line.startswith('[') and line.endswith(']'):
                    current_section = line[1:-1].strip()
                    ndx_dict[current_section] = []
                else:
                    if current_section is not None:
                        ndx_dict[current_section].extend(map(int, line.split()))

        return ndx_dict

    # 新增处理轨迹的方法
    def process_trajectory(self, extract_interval):
        # echo "Protein" | gmx_mpi trjconv -dt {extract_interval} -s {tpr_file} -f {xtc_file} -n {temp_folder}/tarj_show.ndx -pbc mol -o {temp_folder}/temp.xtc
        # 根据提供的脚本逻辑读取和保存索引文件
        ndx_dict = self.read_ndx_file(f'{self.runner_folder}/index.ndx')

        # 根据索引文件内容决定如何处理轨迹
        if any(key.startswith("LG") for key in ndx_dict):
            # 处理含有LG组的情况
            new_ndx_dict = {key: value for key, value in ndx_dict.items() if key.startswith("LG") or key in ["Protein", "Protein_LIG"]}
            self.save_ndx_file(f"{self.temp_folder}/tarj_show.ndx", new_ndx_dict)
            # 构建处理轨迹的命令
            command_1 = f'echo "Protein_LIG" | gmx trjconv -dt {extract_interval} -s {self.tpr_file} -f {self.xtc_file} -n {self.temp_folder}/tarj_show.ndx -pbc mol -o {self.temp_folder}/temp.xtc'
            command_2 = f'echo "Protein\nProtein\nProtein_LIG" | gmx trjconv -s {self.tpr_file} -f {self.temp_folder}/temp.xtc -n {self.temp_folder}/tarj_show.ndx -center -fit rot+trans -o {self.output_folder}/traj_show.xtc'
            command_3 = f'echo "Protein\nProtein\nProtein_LIG" | gmx trjconv -s {self.tpr_file} -f {self.temp_folder}/temp.xtc -n {self.temp_folder}/tarj_show.ndx -center -fit rot+trans -b 0 -e 0 -o {self.output_folder}/tarj_show.pdb'
        else:
            # 处理只含有蛋白质组的情况
            new_ndx_dict = {key: value for key, value in ndx_dict.items() if key in ["Protein"]}
            self.save_ndx_file(f"{self.temp_folder}/tarj_show.ndx", new_ndx_dict)
            # 构建处理轨迹的命令
            command_1 = f'echo "Protein" | gmx trjconv -dt {extract_interval} -s {self.tpr_file} -f {self.xtc_file} -n {self.temp_folder}/tarj_show.ndx -pbc mol -o {self.temp_folder}/temp.xtc'
            command_2 = f'echo "Protein\nProtein\nProtein" | gmx trjconv -s {self.tpr_file} -f {self.temp_folder}/temp.xtc -n {self.temp_folder}/tarj_show.ndx -center -fit rot+trans -o {self.output_folder}/traj_show.xtc'
            command_3 = f'echo "Protein\nProtein\nProtein" | gmx trjconv -s {self.tpr_file} -f {self.temp_folder}/temp.xtc -n {self.temp_folder}/tarj_show.ndx -center -fit rot+trans -b 0 -e 0 -o {self.output_folder}/tarj_show.pdb'
        subprocess.run(command_1, shell=True, check=True)
        subprocess.run(command_2, shell=True, check=True)
        subprocess.run(command_3, shell=True, check=True)

    def run_simulation(self):
        start_time = time.time()
        result = None
        try:
            env_vars = {
                "NAME": self.pdb_file.stem,
                "NSTEPS": str(self.nsteps),
                "DT": str(self.dt),
                "GMXRC_PATH": str(self.gmxrc_path),
                "PATH": os.environ.get("PATH", ""),
                "LD_LIBRARY_PATH": os.environ.get("LD_LIBRARY_PATH", ""),
                "HOME": os.environ["HOME"],
                "CUDA_VISIBLE_DEVICES": str(self.gpu_id) if self.gpu_id is not None else ""
            }
            logger.info(f"pdb_file: {self.pdb_file.name}")
            logger.info(f"Executing script at: {self.bash_script}")
            result = subprocess.run(["bash", str(self.bash_script)], env=env_vars, cwd=self.runner_folder,
                                    capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            logger.error(f"Error in simulation for {self.pdb_file.name}: {e}")
            if e.stdout:
                logger.error(f"Standard Output:\n{e.stdout}")
            if e.stderr:
                logger.error(f"Standard Error:\n{e.stderr}")

        end_time = time.time()
        duration = end_time - start_time

        if result:
            logger.info(f"Simulation for {self.pdb_file.name} completed successfully in {duration:.2f} seconds.")
            if result.stdout:
                logger.info(f"Shell Script Output:\n{result.stdout}")
            if result.stderr:
                logger.error(f"Shell Script Error Output:\n{result.stderr}")
        else:
            logger.error(f"Simulation for {self.pdb_file.name} failed in {duration:.2f} seconds.")

# def main(simulation_steps, time_step, pdb_folder_path, bash_script_path, gmxrc_path):
#     pdb_folder = Path(pdb_folder_path).resolve()
#     for pdb_file in pdb_folder.glob("*.pdb"):
#         runner = SimulationRunner(pdb_file, simulation_steps, time_step, pdb_folder, bash_script_path, gmxrc_path)
#         runner.copy_pdb()
#         logger.info(f"Running simulation for {pdb_file.name} in {runner.runner_folder}...")
#         runner.run_simulation()
#         logger.info(f"Finished simulation for {pdb_file.name}.")
#         runner.process_trajectory(extract_interval=100)  # 例如，每100ps抽取一次轨迹
#         logger.info(f"Finished processing trajectory for {pdb_file.name}. per 100ps")
def detect_gpus():
    """检测系统上的GPU数量。"""
    try:
        output = subprocess.check_output("nvidia-smi -L", shell=True).decode('utf-8')
        return len(output.strip().split('\n'))
    except subprocess.CalledProcessError:
        return 0

def run_simulation_task(pdb_file, simulation_steps, time_step, pdb_folder, bash_script_path, gmxrc_path, gpu_id):
    runner = SimulationRunner(pdb_file, simulation_steps, time_step, pdb_folder, bash_script_path, gmxrc_path)
    runner.set_gpu(gpu_id)  # 设置要使用的GPU
    runner.copy_pdb()
    logger.info(f"Running simulation for {pdb_file.name} on GPU {gpu_id}...")
    runner.run_simulation()
    runner.process_trajectory(extract_interval=100)
    logger.info(f"Finished processing trajectory for {pdb_file.name} on GPU {gpu_id}.")

def main(simulation_steps, time_step, pdb_folder_path, bash_script_path, gmxrc_path):
    pdb_folder = Path(pdb_folder_path).resolve()
    pdb_files = list(pdb_folder.glob("*.pdb"))
    num_gpus = detect_gpus()

    if num_gpus == 0:
        logger.error("No GPUs detected, exiting.")
        return

    # 创建一个进程池，每个GPU运行两个任务
    with multiprocessing.Pool(processes=num_gpus * 2) as pool:
        for i, pdb_file in enumerate(pdb_files):
            gpu_id = i % num_gpus  # 分配GPU
            pool.apply_async(run_simulation_task, (pdb_file, simulation_steps, time_step, pdb_folder, bash_script_path, gmxrc_path, gpu_id))

        pool.close()
        pool.join()

if __name__ == "__main__":
    NSTEPS = 50000000  # Example: 50000000 steps
    DT = 0.002  # Example: 2 fs time step
    PDB_FOLDER_PATH = Path("./pdb_test")  # Assuming the PDB files are in a folder named 'pdb_files' in the current directory
    # 传入自定义的bash脚本路径
    CUSTOM_BASH_SCRIPT_PATH = Path('md_gromacs.sh')
    # 传入 GMXRC 文件的路径
    GMXRC_PATH = Path('/usr/local/gromacs-2021.4-plumed-2.8.0/bin/GMXRC')
    main(NSTEPS, DT, PDB_FOLDER_PATH, CUSTOM_BASH_SCRIPT_PATH, GMXRC_PATH)