重构分子属性分析流程并更新目录结构
1. 目录结构调整: - 创建scripts目录统一存放分析脚本 - 保持数据文件在原有目录结构中 - 生成的CSV文件和PNG图表文件也放在scripts目录下 2. 功能改进: - 更新calculate_qed_values.py脚本,添加对参考分子SDF文件的处理 - 修改analyze_qed_mw_distribution.py脚本,统一使用SDF文件stem名称作为参考分子标识符 - 改进Vina得分提取逻辑,支持从SDF文件中提取所有构象的得分 - 完善KDE分布图绘制,确保参考分子在所有图表中显示统一的名称 3. 文档更新: - 更新README.md中的目录结构说明 - 更新命令行和API使用示例 - 添加详细的使用说明和示例 4. 示例代码: - 更新example_api_usage.py以适应新的目录结构和API调用方式
This commit is contained in:
201
scripts/qed.py
Normal file
201
scripts/qed.py
Normal file
@@ -0,0 +1,201 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
'''
|
||||
@file :scripts/qed.py
|
||||
@Description :QED calculator with joblib parallel support
|
||||
@Date :2025/08/04
|
||||
@Author :lyzeng
|
||||
'''
|
||||
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import QED
|
||||
import pandas as pd
|
||||
from typing import List, Union, Tuple, Optional
|
||||
import joblib
|
||||
from joblib import Parallel, delayed
|
||||
import logging
|
||||
|
||||
# Configure logging.
# NOTE: basicConfig() runs at import time, so importing this module configures
# the root logger at INFO level (per the logging docs it does nothing if the
# root logger already has handlers).
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def calculate_single_qed(smiles: str) -> Optional[Tuple[str, float]]:
    """
    Compute the QED score of one molecule given as a SMILES string.

    Args:
        smiles (str): SMILES representation of the molecule

    Returns:
        tuple: ``(smiles, qed_value)`` on success; ``None`` when RDKit returns
        no molecule for the SMILES or when any exception is raised.
    """
    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            # No molecule was produced: report failure (nothing is logged here).
            return None
        return (smiles, QED.qed(molecule))
    except Exception as e:
        # Best effort: one bad molecule is logged and reported as None so that
        # a batch run is not aborted.
        logger.warning(f"Failed to calculate QED for {smiles}: {e}")
        return None
|
||||
|
||||
|
||||
def parallel_qed_calculation(
    smiles_list: List[str],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.DataFrame:
    """
    Compute QED values for many SMILES strings in parallel with joblib.

    Molecules whose calculation fails (``calculate_single_qed`` returned
    ``None``) are dropped from the result.

    Args:
        smiles_list (List[str]): List of SMILES strings
        n_jobs (int): Number of parallel jobs, forwarded to joblib
            (-1 means using all processors)
        batch_size (int or str): Batch size, forwarded to joblib
        backend (str): Joblib backend name, forwarded to joblib

    Returns:
        pd.DataFrame: DataFrame with 'smiles' and 'qed' columns, one row per
        successfully processed molecule; an empty frame with those two
        columns when nothing succeeded.
    """
    logger.info(f"Calculating QED values for {len(smiles_list)} molecules...")

    # Fan the per-molecule work out across joblib workers.
    runner = Parallel(n_jobs=n_jobs, batch_size=batch_size, backend=backend)
    outcomes = runner(delayed(calculate_single_qed)(smi) for smi in smiles_list)

    # Keep only the molecules whose calculation succeeded.
    successes = [outcome for outcome in outcomes if outcome is not None]
    if not successes:
        logger.warning("No valid QED values calculated")
        return pd.DataFrame(columns=['smiles', 'qed'])

    # Split the (smiles, qed) pairs into the two output columns.
    smiles_column = tuple(pair[0] for pair in successes)
    qed_column = tuple(pair[1] for pair in successes)
    df = pd.DataFrame({'smiles': smiles_column, 'qed': qed_column})

    logger.info(f"Successfully calculated QED values for {len(df)} molecules")
    return df
|
||||
|
||||
|
||||
def calculate_qed_series(
    smiles_series: Union[List[str], pd.Series],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.Series:
    """
    Calculate QED values for a pandas Series or list of SMILES and return as Series.

    Unlike ``parallel_qed_calculation``, failed molecules are kept: the output
    has one entry per input, with NaN where the calculation failed.

    Args:
        smiles_series: Series or list of SMILES strings
        n_jobs (int): Number of parallel jobs, forwarded to joblib
        batch_size (int or str): Batch size, forwarded to joblib
        backend (str): Joblib backend name, forwarded to joblib

    Returns:
        pd.Series: float64 Series named 'qed'. It carries the input's index
        when the input is a Series, otherwise a default integer index.
    """
    if isinstance(smiles_series, pd.Series):
        smiles_list = smiles_series.tolist()
        original_index = smiles_series.index
    else:
        # Materialise so any iterable (tuple, generator, ...) is accepted.
        smiles_list = list(smiles_series)
        original_index = None

    results = Parallel(
        n_jobs=n_jobs,
        batch_size=batch_size,
        backend=backend
    )(delayed(calculate_single_qed)(smiles) for smiles in smiles_list)

    # Mark failures with NaN rather than None and pin the dtype below, so the
    # result is float64 even when every calculation fails or the input is
    # empty (otherwise pandas would fall back to an object-dtype Series).
    qed_values = [float("nan") if r is None else r[1] for r in results]

    # index=None makes pandas build the default integer index.
    return pd.Series(qed_values, index=original_index, dtype="float64", name='qed')
|
||||
|
||||
|
||||
class QEDCalculator:
    """
    Reusable holder of joblib settings for the QED calculation helpers.

    The instance stores ``n_jobs``, ``batch_size`` and ``backend`` once and
    forwards them on every call. Results are not cached: each call recomputes
    the QED values.
    """

    def __init__(
        self,
        n_jobs: int = -1,
        batch_size: Union[int, str] = "auto",
        backend: str = "loky",
    ):
        """
        Initialize the QEDCalculator.

        Args:
            n_jobs (int): Number of parallel jobs. -1 means using all processors
            batch_size (int or str): Batch size for parallel processing
            backend (str): Joblib backend to use; defaults to "loky", the
                value that was previously hard-coded
        """
        self.n_jobs = n_jobs
        self.batch_size = batch_size
        self.backend = backend

    def __repr__(self) -> str:
        """Return a constructor-style description of the stored settings."""
        return (
            f"{type(self).__name__}(n_jobs={self.n_jobs!r}, "
            f"batch_size={self.batch_size!r}, backend={self.backend!r})"
        )

    def calculate(self, smiles_list: List[str]) -> pd.DataFrame:
        """
        Calculate QED values for a list of SMILES.

        Delegates to ``parallel_qed_calculation`` with the stored settings.

        Args:
            smiles_list (List[str]): List of SMILES strings

        Returns:
            pd.DataFrame: DataFrame with 'smiles' and 'qed' columns
        """
        return parallel_qed_calculation(
            smiles_list,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend
        )

    def calculate_series(self, smiles_series: Union[List[str], pd.Series]) -> pd.Series:
        """
        Calculate QED values for a pandas Series or list of SMILES and return as Series.

        Delegates to ``calculate_qed_series`` with the stored settings.

        Args:
            smiles_series: Series or list of SMILES strings

        Returns:
            pd.Series: Series of QED values
        """
        return calculate_qed_series(
            smiles_series,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend
        )
|
||||
|
||||
"""
Usage

# NOTE: adjust the import path to wherever this module lives in your project.
# The path below is kept from the original example; this file itself is
# scripts/qed.py.
from utils.qed_calculator import parallel_qed_calculation, QEDCalculator

# Option 1: call the function directly
smiles_list = ['CCO', 'CCN', 'CCC']
qed_df = parallel_qed_calculation(smiles_list, n_jobs=-1)

# Option 2: use the class
calculator = QEDCalculator(n_jobs=-1)
qed_df = calculator.calculate(smiles_list)

# Option 3: process a pandas Series
import pandas as pd
smiles_series = pd.Series(['CCO', 'CCN', 'CCC'], name='smiles')
qed_series = calculator.calculate_series(smiles_series)
"""
|
||||
Reference in New Issue
Block a user