201 lines
5.8 KiB
Python
201 lines
5.8 KiB
Python
#!/usr/bin/env python
|
||
# -*- encoding: utf-8 -*-
|
||
'''
|
||
@file :qed_calculator.py
|
||
@Description :QED calculator with joblib parallel support
|
||
@Date :2025/08/04
|
||
@Author :lyzeng
|
||
'''
|
||
|
||
from rdkit import Chem
|
||
from rdkit.Chem import QED
|
||
import pandas as pd
|
||
from typing import List, Union, Tuple, Optional
|
||
import joblib
|
||
from joblib import Parallel, delayed
|
||
import logging
|
||
|
||
# Set up logging: configure the root logger at INFO level when this module is
# imported, and create the module-level logger used by the functions below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def calculate_single_qed(smiles: str) -> Optional[Tuple[str, float]]:
    """
    Compute the QED score of one molecule given as a SMILES string.

    Args:
        smiles (str): SMILES representation of the molecule

    Returns:
        tuple: ``(smiles, qed_value)`` on success. ``None`` when RDKit
        returns no molecule for the SMILES, or when any exception is raised
        during parsing or scoring (the exception is logged as a warning).
    """
    try:
        molecule = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals an unparsable SMILES by returning None.
        if molecule is None:
            return None
        return (smiles, QED.qed(molecule))
    except Exception as e:
        # Best effort: one bad molecule must not abort a whole parallel run.
        logger.warning(f"Failed to calculate QED for {smiles}: {e}")
        return None
|
||
|
||
|
||
def parallel_qed_calculation(
    smiles_list: List[str],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.DataFrame:
    """
    Compute QED values for many SMILES strings in parallel with joblib.

    Each SMILES is passed to ``calculate_single_qed``; entries for which that
    function returns ``None`` are dropped from the result.

    Args:
        smiles_list (List[str]): List of SMILES strings
        n_jobs (int): Number of parallel jobs. -1 means using all processors
        batch_size (int or str): Batch size forwarded to ``joblib.Parallel``
        backend (str): Joblib backend to use ('loky', 'threading', 'multiprocessing')

    Returns:
        pd.DataFrame: DataFrame with 'smiles' and 'qed' columns, one row per
        successfully scored molecule. Empty (columns only) if none succeeded.
    """
    logger.info(f"Calculating QED values for {len(smiles_list)} molecules...")

    # Fan the per-molecule work out over the joblib workers.
    runner = Parallel(n_jobs=n_jobs, batch_size=batch_size, backend=backend)
    tasks = (delayed(calculate_single_qed)(smiles) for smiles in smiles_list)
    outcomes = runner(tasks)

    # Keep only the molecules that were scored successfully.
    successes = [outcome for outcome in outcomes if outcome is not None]

    if len(successes) == 0:
        logger.warning("No valid QED values calculated")
        return pd.DataFrame(columns=['smiles', 'qed'])

    # Split the (smiles, qed) pairs into the two output columns.
    smiles_column = [pair[0] for pair in successes]
    qed_column = [pair[1] for pair in successes]
    frame = pd.DataFrame({'smiles': smiles_column, 'qed': qed_column})

    logger.info(f"Successfully calculated QED values for {len(frame)} molecules")
    return frame
|
||
|
||
|
||
def calculate_qed_series(
    smiles_series: Union[List[str], pd.Series],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.Series:
    """
    Calculate QED values for a pandas Series or list of SMILES and return as Series.

    Unlike ``parallel_qed_calculation``, failed molecules are not dropped: the
    output has exactly one entry per input, in input order.

    Args:
        smiles_series: Series or list of SMILES strings
        n_jobs (int): Number of parallel jobs. -1 means using all processors
        batch_size (int or str): Batch size forwarded to ``joblib.Parallel``
        backend (str): Joblib backend to use

    Returns:
        pd.Series: float Series named 'qed'. It carries the input's index when
        the input is a Series, otherwise a default integer index. Molecules
        whose calculation failed are NaN.
    """
    if isinstance(smiles_series, pd.Series):
        smiles_list = smiles_series.tolist()
        original_index = smiles_series.index
    else:
        smiles_list = smiles_series
        original_index = None

    # Compute QED values; each result is (smiles, qed) or None on failure.
    results = Parallel(
        n_jobs=n_jobs,
        batch_size=batch_size,
        backend=backend
    )(delayed(calculate_single_qed)(smiles) for smiles in smiles_list)

    # Keep one slot per input; failures stay as None placeholders here.
    qed_values = [r[1] if r is not None else None for r in results]

    # Force a float dtype so the result is always float64 with NaN for
    # failures. Without it, an empty or all-failed input yields an object
    # Series of None while a partially successful one yields float64.
    # index=None falls back to the default integer index.
    return pd.Series(qed_values, index=original_index, name='qed', dtype=float)
|
||
|
||
|
||
class QEDCalculator:
    """
    Reusable holder of joblib settings for QED calculation.

    The instance stores the parallel configuration once and forwards it to the
    module-level functions ``parallel_qed_calculation`` and
    ``calculate_qed_series``. No results are stored on the instance.
    """

    def __init__(
        self,
        n_jobs: int = -1,
        batch_size: Union[int, str] = "auto",
        backend: str = "loky"
    ):
        """
        Initialize the QEDCalculator.

        Args:
            n_jobs (int): Number of parallel jobs. -1 means using all processors
            batch_size (int or str): Batch size for parallel processing
            backend (str): Joblib backend to use. Defaults to 'loky', the
                value that was previously fixed.
        """
        self.n_jobs = n_jobs
        self.batch_size = batch_size
        self.backend = backend

    def calculate(self, smiles_list: List[str]) -> pd.DataFrame:
        """
        Calculate QED values for a list of SMILES.

        Args:
            smiles_list (List[str]): List of SMILES strings

        Returns:
            pd.DataFrame: DataFrame with 'smiles' and 'qed' columns
        """
        return parallel_qed_calculation(
            smiles_list,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend
        )

    def calculate_series(self, smiles_series: Union[List[str], pd.Series]) -> pd.Series:
        """
        Calculate QED values for a pandas Series or list of SMILES and return as Series.

        Args:
            smiles_series: Series or list of SMILES strings

        Returns:
            pd.Series: Series of QED values
        """
        return calculate_qed_series(
            smiles_series,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend
        )
|
||
|
||
"""
|
||
Usage:

from utils.qed_calculator import parallel_qed_calculation, QEDCalculator

# Option 1: call the function directly
smiles_list = ['CCO', 'CCN', 'CCC']
qed_df = parallel_qed_calculation(smiles_list, n_jobs=-1)

# Option 2: use the class
calculator = QEDCalculator(n_jobs=-1)
qed_df = calculator.calculate(smiles_list)

# Option 3: process a pandas Series
import pandas as pd
smiles_series = pd.Series(['CCO', 'CCN', 'CCC'], name='smiles')
qed_series = calculator.calculate_series(smiles_series)
|
||
""" |