Files
lingyuzeng f6c182f38e 重构分子属性分析流程并更新目录结构
1. 目录结构调整:
   - 创建scripts目录统一存放分析脚本
   - 保持数据文件在原有目录结构中
   - 生成的CSV文件和PNG图表文件也放在scripts目录下

2. 功能改进:
   - 更新calculate_qed_values.py脚本,添加对参考分子SDF文件的处理
   - 修改analyze_qed_mw_distribution.py脚本,统一使用SDF文件stem名称作为参考分子标识符
   - 改进Vina得分提取逻辑,支持从SDF文件中提取所有构象的得分
   - 完善KDE分布图绘制,确保参考分子在所有图表中显示统一的名称

3. 文档更新:
   - 更新README.md中的目录结构说明
   - 更新命令行和API使用示例
   - 添加详细的使用说明和示例

4. 示例代码:
   - 更新example_api_usage.py以适应新的目录结构和API调用方式
2025-08-05 17:00:39 +08:00

201 lines
5.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :qed_calculator.py
@Description :QED calculator with joblib parallel support
@Date :2025/08/04
@Author :lyzeng
'''
from rdkit import Chem
from rdkit.Chem import QED
import pandas as pd
from typing import List, Union, Tuple, Optional
import joblib
from joblib import Parallel, delayed
import logging
# Configure logging. This runs at import time: basicConfig is asked for INFO
# level, and the module then logs through its own logger named after __name__.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def calculate_single_qed(smiles: str) -> Optional[Tuple[str, float]]:
    """
    Calculate the QED value for a single molecule.

    Args:
        smiles (str): SMILES representation of the molecule

    Returns:
        tuple: (smiles, qed_value), or None if the SMILES cannot be parsed
        or the QED computation raises an exception
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # RDKit reports an unparseable SMILES by returning None instead of
            # raising, so log it here rather than dropping it silently.
            logger.warning(f"Failed to parse SMILES: {smiles}")
            return None
        return (smiles, QED.qed(mol))
    except Exception as e:
        # Best effort: one bad molecule must not abort the whole batch.
        logger.warning(f"Failed to calculate QED for {smiles}: {e}")
        return None
def parallel_qed_calculation(
    smiles_list: List[str],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.DataFrame:
    """
    Compute QED values for many SMILES strings in parallel with joblib.

    Entries for which calculate_single_qed returns None are left out of the
    result, so the output may have fewer rows than the input.

    Args:
        smiles_list (List[str]): SMILES strings to evaluate
        n_jobs (int): Number of parallel jobs, passed through to joblib
        batch_size (int or str): Batch size, passed through to joblib
        backend (str): Joblib backend name, passed through to joblib

    Returns:
        pd.DataFrame: DataFrame with 'smiles' and 'qed' columns
    """
    logger.info(f"Calculating QED values for {len(smiles_list)} molecules...")

    # Fan the per-molecule work out over the joblib workers.
    runner = Parallel(n_jobs=n_jobs, batch_size=batch_size, backend=backend)
    outcomes = runner(delayed(calculate_single_qed)(smi) for smi in smiles_list)

    # Split the successful (smiles, qed) pairs into two columns in one pass.
    kept_smiles = []
    kept_qed = []
    for outcome in outcomes:
        if outcome is None:
            continue
        kept_smiles.append(outcome[0])
        kept_qed.append(outcome[1])

    if len(kept_smiles) == 0:
        logger.warning("No valid QED values calculated")
        return pd.DataFrame(columns=['smiles', 'qed'])

    frame = pd.DataFrame({
        'smiles': tuple(kept_smiles),
        'qed': tuple(kept_qed),
    })
    logger.info(f"Successfully calculated QED values for {len(frame)} molecules")
    return frame
def calculate_qed_series(
    smiles_series: Union[List[str], pd.Series],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.Series:
    """
    Calculate QED values for a pandas Series or list of SMILES and return a Series.

    The result has one entry per input, in input order. Entries for which
    calculate_single_qed returns None become NaN.

    Args:
        smiles_series: Series or list of SMILES strings
        n_jobs (int): Number of parallel jobs, passed through to joblib
        batch_size (int or str): Batch size, passed through to joblib
        backend (str): Joblib backend name, passed through to joblib

    Returns:
        pd.Series: float64 Series named 'qed'; it reuses the input index when
        the input is a Series and gets a default index otherwise
    """
    if isinstance(smiles_series, pd.Series):
        smiles_list = smiles_series.tolist()
        original_index = smiles_series.index
    else:
        # Materialise once so the emptiness check works for any iterable.
        smiles_list = list(smiles_series)
        original_index = None

    if smiles_list:
        results = Parallel(
            n_jobs=n_jobs,
            batch_size=batch_size,
            backend=backend
        )(delayed(calculate_single_qed)(smiles) for smiles in smiles_list)
        # Failed calculations come back as None; keep their position.
        qed_values = [r[1] if r is not None else None for r in results]
    else:
        # Nothing to compute, so do not start a worker pool.
        qed_values = []

    # Force float64 so the dtype does not depend on the data: without it an
    # empty or all-failed input would yield an object Series of None.
    # index=None is the pandas default, so one call covers both input kinds.
    return pd.Series(
        qed_values,
        index=original_index,
        name='qed',
        dtype='float64'
    )
class QEDCalculator:
    """
    Reusable QED calculator that stores joblib settings for repeated calls.

    This is a thin wrapper around parallel_qed_calculation and
    calculate_qed_series; it does not cache results between calls.
    """

    def __init__(
        self,
        n_jobs: int = -1,
        batch_size: Union[int, str] = "auto",
        backend: str = "loky"
    ):
        """
        Initialize the QEDCalculator.

        Args:
            n_jobs (int): Number of parallel jobs. -1 means using all processors
            batch_size (int or str): Batch size for parallel processing
            backend (str): Joblib backend to use; defaults to "loky", the
                value that was previously hard-coded
        """
        self.n_jobs = n_jobs
        self.batch_size = batch_size
        self.backend = backend

    def calculate(self, smiles_list: List[str]) -> pd.DataFrame:
        """
        Calculate QED values for a list of SMILES.

        Args:
            smiles_list (List[str]): List of SMILES strings

        Returns:
            pd.DataFrame: DataFrame with 'smiles' and 'qed' columns
        """
        return parallel_qed_calculation(
            smiles_list,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend
        )

    def calculate_series(self, smiles_series: Union[List[str], pd.Series]) -> pd.Series:
        """
        Calculate QED values for a pandas Series or list of SMILES and return a Series.

        Args:
            smiles_series: Series or list of SMILES strings

        Returns:
            pd.Series: Series of QED values
        """
        return calculate_qed_series(
            smiles_series,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend
        )
"""
usage
from utils.qed_calculator import parallel_qed_calculation, QEDCalculator
# 方式1: 直接使用函数
smiles_list = ['CCO', 'CCN', 'CCC']
qed_df = parallel_qed_calculation(smiles_list, n_jobs=-1)
# 方式2: 使用类
calculator = QEDCalculator(n_jobs=-1)
qed_df = calculator.calculate(smiles_list)
# 方式3: 处理pandas Series
import pandas as pd
smiles_series = pd.Series(['CCO', 'CCN', 'CCC'], name='smiles')
qed_series = calculator.calculate_series(smiles_series)
"""