This commit is contained in:
2025-10-15 20:14:12 +08:00
parent f4afbb7712
commit 7ac0e58599
16 changed files with 6085 additions and 0 deletions

201
scripts/qed.py Normal file
View File

@@ -0,0 +1,201 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :qed_calculator.py
@Description :QED calculator with joblib parallel support
@Date :2025/08/04
@Author :lyzeng
'''
from rdkit import Chem
from rdkit.Chem import QED
import pandas as pd
from typing import List, Union, Tuple, Optional
import joblib
from joblib import Parallel, delayed
import logging
# Set up logging: INFO level via basicConfig, plus the module-level logger
# used by the functions in this file.
# NOTE(review): basicConfig runs at import time, so importing this module
# may configure the root logger of the host application — confirm intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def calculate_single_qed(smiles: str) -> Optional[Tuple[str, float]]:
    """Calculate the QED value for a single molecule.

    Args:
        smiles (str): SMILES representation of the molecule.

    Returns:
        tuple: ``(smiles, qed_value)``, or ``None`` when no molecule could be
        built from ``smiles`` or the calculation raised an exception.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # No molecule was parsed from the SMILES; report failure.
            return None
        return (smiles, QED.qed(mol))
    except Exception as e:
        # Best-effort: log the problem and report failure as None so that
        # callers can skip this molecule instead of aborting.
        # Lazy %-style arguments defer formatting to the logging framework.
        logger.warning("Failed to calculate QED for %s: %s", smiles, e)
        return None
def parallel_qed_calculation(
    smiles_list: List[str],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.DataFrame:
    """Compute QED scores for many SMILES strings in parallel with joblib.

    Entries for which ``calculate_single_qed`` returns ``None`` are dropped,
    so the result may contain fewer rows than ``smiles_list`` has items.

    Args:
        smiles_list (List[str]): SMILES strings to score.
        n_jobs (int): Number of parallel jobs, forwarded to joblib.
        batch_size (int or str): Batch size, forwarded to joblib.
        backend (str): Joblib backend name, forwarded to joblib.

    Returns:
        pd.DataFrame: Frame with 'smiles' and 'qed' columns, one row per
        successfully scored molecule (empty if none succeeded).
    """
    logger.info(f"Calculating QED values for {len(smiles_list)} molecules...")

    # Fan the per-molecule work out to joblib workers.
    runner = Parallel(n_jobs=n_jobs, batch_size=batch_size, backend=backend)
    tasks = (delayed(calculate_single_qed)(smi) for smi in smiles_list)

    # Keep only the (smiles, qed) pairs; None marks a failed molecule.
    scored = [pair for pair in runner(tasks) if pair is not None]
    if not scored:
        logger.warning("No valid QED values calculated")
        return pd.DataFrame(columns=['smiles', 'qed'])

    # Split the pairs into the two output columns.
    frame = pd.DataFrame({
        'smiles': [pair[0] for pair in scored],
        'qed': [pair[1] for pair in scored],
    })
    logger.info(f"Successfully calculated QED values for {len(frame)} molecules")
    return frame
def calculate_qed_series(
    smiles_series: Union[List[str], pd.Series],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.Series:
    """Calculate QED values for SMILES strings and return them as a Series.

    Unlike a filtered result, the output keeps one entry per input: a
    molecule whose calculation failed is represented by NaN.

    Args:
        smiles_series: Series or list of SMILES strings.
        n_jobs (int): Number of parallel jobs, forwarded to joblib.
        batch_size (int or str): Batch size, forwarded to joblib.
        backend (str): Joblib backend name, forwarded to joblib.

    Returns:
        pd.Series: float Series named 'qed'. It carries the index of the
        input when the input is a Series, otherwise the default index.
    """
    if isinstance(smiles_series, pd.Series):
        smiles_list = smiles_series.tolist()
        original_index = smiles_series.index
    else:
        smiles_list = smiles_series
        # index=None lets pandas build its default index.
        original_index = None

    # Compute the QED values in parallel.
    results = Parallel(
        n_jobs=n_jobs,
        batch_size=batch_size,
        backend=backend
    )(delayed(calculate_single_qed)(smiles) for smiles in smiles_list)

    # Failed calculations come back as None. Map them to NaN and force a
    # float dtype so the result type is the same whether some, all, or none
    # of the molecules succeeded (all-None input would otherwise give an
    # object-dtype Series).
    qed_values = [float("nan") if r is None else r[1] for r in results]
    return pd.Series(qed_values, index=original_index, name='qed', dtype=float)
class QEDCalculator:
    """Reusable holder of joblib settings for QED calculations.

    Stores ``n_jobs``, ``batch_size`` and ``backend`` once and forwards them
    to the module-level calculation functions on every call.
    """

    def __init__(
        self,
        n_jobs: int = -1,
        batch_size: Union[int, str] = "auto",
        backend: str = "loky",
    ):
        """Initialize the QEDCalculator.

        Args:
            n_jobs (int): Number of parallel jobs. -1 means using all processors.
            batch_size (int or str): Batch size for parallel processing.
            backend (str): Joblib backend to use; defaults to "loky", the
                value that was previously fixed.
        """
        self.n_jobs = n_jobs
        self.batch_size = batch_size
        self.backend = backend

    def calculate(self, smiles_list: List[str]) -> pd.DataFrame:
        """Calculate QED values for a list of SMILES.

        Args:
            smiles_list (List[str]): List of SMILES strings.

        Returns:
            pd.DataFrame: DataFrame with 'smiles' and 'qed' columns, as
            produced by ``parallel_qed_calculation``.
        """
        return parallel_qed_calculation(
            smiles_list,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend
        )

    def calculate_series(self, smiles_series: Union[List[str], pd.Series]) -> pd.Series:
        """Calculate QED values for a Series or list of SMILES as a Series.

        Args:
            smiles_series: Series or list of SMILES strings.

        Returns:
            pd.Series: Series of QED values, as produced by
            ``calculate_qed_series``.
        """
        return calculate_qed_series(
            smiles_series,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend
        )
"""
Usage:
from utils.qed_calculator import parallel_qed_calculation, QEDCalculator
# Option 1: call the function directly
smiles_list = ['CCO', 'CCN', 'CCC']
qed_df = parallel_qed_calculation(smiles_list, n_jobs=-1)
# Option 2: use the class
calculator = QEDCalculator(n_jobs=-1)
qed_df = calculator.calculate(smiles_list)
# Option 3: process a pandas Series
import pandas as pd
smiles_series = pd.Series(['CCO', 'CCN', 'CCC'], name='smiles')
qed_series = calculator.calculate_series(smiles_series)
"""