#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@file        :qed_calculator.py
@Description :QED calculator with joblib parallel support
@Date        :2025/08/04
@Author      :lyzeng

Usage::

    from utils.qed_calculator import parallel_qed_calculation, QEDCalculator

    # Option 1: call the function directly
    smiles_list = ['CCO', 'CCN', 'CCC']
    qed_df = parallel_qed_calculation(smiles_list, n_jobs=-1)

    # Option 2: use the class
    calculator = QEDCalculator(n_jobs=-1)
    qed_df = calculator.calculate(smiles_list)

    # Option 3: process a pandas Series
    import pandas as pd
    smiles_series = pd.Series(['CCO', 'CCN', 'CCC'], name='smiles')
    qed_series = calculator.calculate_series(smiles_series)
"""

import logging
from typing import Iterable, List, Union, Tuple, Optional

import joblib
import pandas as pd
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import QED

# Configure logging.
# NOTE(review): calling basicConfig at import time configures the root logger
# for every importer of this module; kept as-is because callers may rely on it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def calculate_single_qed(smiles: str) -> Optional[Tuple[str, float]]:
    """
    Calculate the QED value for a single molecule.

    Args:
        smiles (str): SMILES representation of the molecule

    Returns:
        tuple: ``(smiles, qed_value)``, or ``None`` when the SMILES cannot be
        parsed (``Chem.MolFromSmiles`` returned ``None``) or the calculation
        raised an exception.
    """
    # Deliberately broad: this function runs once per molecule inside a
    # parallel batch, and a single bad input must not abort the whole run.
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return (smiles, QED.qed(mol))
    except Exception as e:
        logger.warning("Failed to calculate QED for %s: %s", smiles, e)
        return None


def _run_parallel(
    smiles_list: List[str],
    n_jobs: int,
    batch_size: Union[int, str],
    backend: str,
) -> List[Optional[Tuple[str, float]]]:
    """Apply ``calculate_single_qed`` to every SMILES with joblib.

    The returned list has one entry per input, in input order; failed
    calculations are represented by ``None``.
    """
    runner = Parallel(n_jobs=n_jobs, batch_size=batch_size, backend=backend)
    return runner(delayed(calculate_single_qed)(smiles) for smiles in smiles_list)


def parallel_qed_calculation(
    smiles_list: Iterable[str],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky",
) -> pd.DataFrame:
    """
    Calculate QED values for a collection of SMILES in parallel using joblib.

    Molecules whose QED could not be calculated are dropped from the result,
    so the output may contain fewer rows than the input.

    Args:
        smiles_list (Iterable[str]): SMILES strings (list, Series, generator, ...)
        n_jobs (int): Number of parallel jobs. -1 means using all processors
        batch_size (int or str): Batch size for parallel processing
        backend (str): Joblib backend to use ('loky', 'threading', 'multiprocessing')

    Returns:
        pd.DataFrame: DataFrame with 'smiles' and 'qed' columns
    """
    # Materialise once so that len() works for generators and other
    # one-shot iterables as well as for lists.
    smiles_list = list(smiles_list)
    logger.info("Calculating QED values for %d molecules...", len(smiles_list))

    # Compute QED values in parallel.
    results = _run_parallel(smiles_list, n_jobs, batch_size, backend)

    # Drop failed calculations (None results).
    valid_results = [r for r in results if r is not None]

    if not valid_results:
        logger.warning("No valid QED values calculated")
        return pd.DataFrame(columns=['smiles', 'qed'])

    # Each valid result is a (smiles, qed) pair, i.e. one row of the frame.
    df = pd.DataFrame(valid_results, columns=['smiles', 'qed'])

    logger.info("Successfully calculated QED values for %d molecules", len(df))
    return df


def calculate_qed_series(
    smiles_series: Union[List[str], pd.Series],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky",
) -> pd.Series:
    """
    Calculate QED values for a pandas Series or list of SMILES and return as Series.

    Unlike ``parallel_qed_calculation``, the result always has one entry per
    input; failed calculations are reported as NaN.

    Args:
        smiles_series: Series or list of SMILES strings
        n_jobs (int): Number of parallel jobs
        batch_size (int or str): Batch size for parallel processing
        backend (str): Joblib backend to use

    Returns:
        pd.Series: float Series named 'qed' with the same index as the input
        (if the input is a Series), otherwise a default integer index.
    """
    if isinstance(smiles_series, pd.Series):
        original_index = smiles_series.index
        smiles_list = smiles_series.tolist()
    else:
        original_index = None
        smiles_list = list(smiles_series)

    # Compute QED values in parallel.
    results = _run_parallel(smiles_list, n_jobs, batch_size, backend)

    # Extract the QED value; failed calculations become NaN.
    nan = float('nan')
    qed_values = [nan if r is None else r[1] for r in results]

    # Force a float dtype so that all-failed or empty inputs do not yield an
    # object-dtype Series (index=None falls back to the default RangeIndex).
    return pd.Series(qed_values, index=original_index, name='qed', dtype=float)


class QEDCalculator:
    """
    Convenience wrapper that stores parallel-processing settings and applies
    them to the module-level QED calculation functions.
    """

    def __init__(
        self,
        n_jobs: int = -1,
        batch_size: Union[int, str] = "auto",
        backend: str = "loky",
    ):
        """
        Initialize the QEDCalculator.

        Args:
            n_jobs (int): Number of parallel jobs. -1 means using all processors
            batch_size (int or str): Batch size for parallel processing
            backend (str): Joblib backend to use; defaults to 'loky'
        """
        self.n_jobs = n_jobs
        self.batch_size = batch_size
        self.backend = backend

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(n_jobs={self.n_jobs!r}, "
            f"batch_size={self.batch_size!r}, backend={self.backend!r})"
        )

    def calculate(self, smiles_list: List[str]) -> pd.DataFrame:
        """
        Calculate QED values for a list of SMILES.

        Args:
            smiles_list (List[str]): List of SMILES strings

        Returns:
            pd.DataFrame: DataFrame with 'smiles' and 'qed' columns
        """
        return parallel_qed_calculation(
            smiles_list,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend,
        )

    def calculate_series(self, smiles_series: Union[List[str], pd.Series]) -> pd.Series:
        """
        Calculate QED values for a pandas Series or list of SMILES and return as Series.

        Args:
            smiles_series: Series or list of SMILES strings

        Returns:
            pd.Series: Series of QED values
        """
        return calculate_qed_series(
            smiles_series,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend,
        )