Files
vinatools/scripts/qed.py
2025-10-15 20:14:12 +08:00

201 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :qed_calculator.py
@Description :QED calculator with joblib parallel support
@Date :2025/08/04
@Author :lyzeng
'''
from rdkit import Chem
from rdkit.Chem import QED
import pandas as pd
from typing import List, Union, Tuple, Optional
import joblib
from joblib import Parallel, delayed
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def calculate_single_qed(smiles: str) -> Optional[Tuple[str, float]]:
    """
    Compute the QED score of one molecule given as a SMILES string.

    Args:
        smiles (str): SMILES representation of the molecule

    Returns:
        tuple: (smiles, qed_value) when the SMILES parses and scoring succeeds;
        None when no molecule could be built or an exception was raised
    """
    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            # The SMILES could not be turned into a molecule; nothing to score.
            return None
        return (smiles, QED.qed(molecule))
    except Exception as e:
        # Best-effort: one bad input must not abort a whole batch.
        logger.warning(f"Failed to calculate QED for {smiles}: {e}")
        return None
def parallel_qed_calculation(
    smiles_list: List[str],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.DataFrame:
    """
    Score a list of SMILES strings in parallel and tabulate the successes.

    Entries for which ``calculate_single_qed`` returns None are dropped, so
    the result may have fewer rows than ``smiles_list`` has items.

    Args:
        smiles_list (List[str]): List of SMILES strings
        n_jobs (int): Number of parallel jobs, passed to joblib.Parallel
        batch_size (int or str): Batch size, passed to joblib.Parallel
        backend (str): Joblib backend name, passed to joblib.Parallel

    Returns:
        pd.DataFrame: DataFrame with 'smiles' and 'qed' columns; empty (but
        with both columns) when nothing could be scored
    """
    logger.info(f"Calculating QED values for {len(smiles_list)} molecules...")

    # Fan the per-molecule work out over joblib workers.
    runner = Parallel(n_jobs=n_jobs, batch_size=batch_size, backend=backend)
    outcomes = runner(delayed(calculate_single_qed)(smiles) for smiles in smiles_list)

    # Keep only successful (smiles, qed) pairs, splitting them into columns.
    kept_smiles = []
    kept_scores = []
    for outcome in outcomes:
        if outcome is None:
            continue
        kept_smiles.append(outcome[0])
        kept_scores.append(outcome[1])

    if not kept_smiles:
        logger.warning("No valid QED values calculated")
        return pd.DataFrame(columns=['smiles', 'qed'])

    table = pd.DataFrame({
        'smiles': tuple(kept_smiles),
        'qed': tuple(kept_scores)
    })
    logger.info(f"Successfully calculated QED values for {len(table)} molecules")
    return table
def calculate_qed_series(
    smiles_series: Union[List[str], pd.Series],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.Series:
    """
    Calculate QED values for a pandas Series or list of SMILES and return as Series.

    Unlike a filtered table, the result keeps one entry per input, in input
    order. Inputs that could not be scored are reported as NaN, and the
    result is always float64 -- including when the input is empty or every
    molecule fails -- so callers can rely on ``isna()`` / numeric operations.

    Args:
        smiles_series: Series or list of SMILES strings
        n_jobs (int): Number of parallel jobs, passed to joblib.Parallel
        batch_size (int or str): Batch size, passed to joblib.Parallel
        backend (str): Joblib backend name, passed to joblib.Parallel

    Returns:
        pd.Series: float64 Series named 'qed', carrying the input's index
        when the input is a Series and a default index otherwise
    """
    if isinstance(smiles_series, pd.Series):
        smiles_list = smiles_series.tolist()
        original_index = smiles_series.index
    else:
        smiles_list = smiles_series
        original_index = None  # lets pandas build its default index

    # Compute the QED values in parallel, one task per molecule.
    results = Parallel(
        n_jobs=n_jobs,
        batch_size=batch_size,
        backend=backend
    )(delayed(calculate_single_qed)(smiles) for smiles in smiles_list)

    # Failed calculations come back as None; record them as NaN so the
    # missing-value marker does not depend on whether anything succeeded.
    qed_values = [float("nan") if r is None else r[1] for r in results]

    # An explicit dtype keeps empty / all-failed results numeric instead of
    # letting pandas fall back to an object Series.
    return pd.Series(qed_values, index=original_index, name='qed', dtype="float64")
class QEDCalculator:
    """
    Holds joblib settings and applies them to the module's QED functions.

    The instance only stores ``n_jobs``, ``batch_size`` and ``backend``; each
    call forwards them to ``parallel_qed_calculation`` or
    ``calculate_qed_series``. Results are not stored between calls.
    """

    def __init__(
        self,
        n_jobs: int = -1,
        batch_size: Union[int, str] = "auto",
        backend: str = "loky"
    ):
        """
        Initialize the QEDCalculator.

        Args:
            n_jobs (int): Number of parallel jobs, passed to joblib.Parallel
            batch_size (int or str): Batch size, passed to joblib.Parallel
            backend (str): Joblib backend name, passed to joblib.Parallel;
                defaults to "loky", the value previously hard-coded here
        """
        self.n_jobs = n_jobs
        self.batch_size = batch_size
        self.backend = backend

    def calculate(self, smiles_list: List[str]) -> pd.DataFrame:
        """
        Calculate QED values for a list of SMILES.

        Args:
            smiles_list (List[str]): List of SMILES strings

        Returns:
            pd.DataFrame: DataFrame with 'smiles' and 'qed' columns, as
            returned by ``parallel_qed_calculation``
        """
        return parallel_qed_calculation(
            smiles_list,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend
        )

    def calculate_series(self, smiles_series: Union[List[str], pd.Series]) -> pd.Series:
        """
        Calculate QED values for a pandas Series or list of SMILES and return as Series.

        Args:
            smiles_series: Series or list of SMILES strings

        Returns:
            pd.Series: Series of QED values, as returned by
            ``calculate_qed_series``
        """
        return calculate_qed_series(
            smiles_series,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend
        )
"""
Usage:

from utils.qed_calculator import parallel_qed_calculation, QEDCalculator

# Option 1: call the function directly
smiles_list = ['CCO', 'CCN', 'CCC']
qed_df = parallel_qed_calculation(smiles_list, n_jobs=-1)

# Option 2: use the class
calculator = QEDCalculator(n_jobs=-1)
qed_df = calculator.calculate(smiles_list)

# Option 3: process a pandas Series
import pandas as pd
smiles_series = pd.Series(['CCO', 'CCN', 'CCC'], name='smiles')
qed_series = calculator.calculate_series(smiles_series)
"""