Files
vina_docking_batch/scripts/calculate_qed_values.py
2025-08-05 20:37:33 +08:00

228 lines
7.7 KiB
Python

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :calculate_qed_values.py
@Description :Calculate QED values for molecules in poses_all directories and reference molecules
@Date :2025/08/04
@Author :lyzeng
'''
import pandas as pd
from rdkit import Chem
from rdkit.Chem import QED
from rdkit.Chem.Descriptors import MolWt
from pathlib import Path
import logging
import json
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_smiles_from_sdf(sdf_file_path):
"""
Extract SMILES from the first molecule in an SDF file
Args:
sdf_file_path (Path): Path to the SDF file
Returns:
str: SMILES representation of the first molecule or None if failed
"""
try:
supplier = Chem.SDMolSupplier(str(sdf_file_path))
mol = next(supplier) # Get the first molecule
if mol is not None:
smiles = Chem.MolToSmiles(mol)
return smiles
except Exception as e:
logger.warning(f"Failed to extract SMILES from {sdf_file_path}: {e}")
return None
def get_smiles_from_mol2(mol2_file_path):
"""
Extract SMILES from a mol2 file
Args:
mol2_file_path (Path): Path to the mol2 file
Returns:
str: SMILES representation of the molecule or None if failed
"""
try:
mol = Chem.MolFromMol2File(str(mol2_file_path))
if mol is not None:
smiles = Chem.MolToSmiles(mol)
return smiles
except Exception as e:
logger.warning(f"Failed to extract SMILES from {mol2_file_path}: {e}")
return None
def extract_vina_scores_from_sdf(sdf_file_path):
"""
Extract Vina scores from all conformers in an SDF file
Args:
sdf_file_path (Path): Path to the SDF file
Returns:
list: List of Vina scores (free_energy values) or empty list if failed
"""
scores = []
try:
supplier = Chem.SDMolSupplier(str(sdf_file_path), removeHs=False)
for mol in supplier:
if mol is None:
continue
# Get the meeko property which contains docking information
if mol.HasProp("meeko"):
meeko_raw = mol.GetProp("meeko")
try:
meeko_dict = json.loads(meeko_raw)
# Extract free energy (Vina score)
if 'free_energy' in meeko_dict:
scores.append(meeko_dict['free_energy'])
except json.JSONDecodeError:
logger.warning(f"Failed to parse meeko JSON for {sdf_file_path}")
else:
logger.warning(f"No meeko property found in molecule from {sdf_file_path}")
except Exception as e:
logger.warning(f"Failed to extract Vina scores from {sdf_file_path}: {e}")
return scores
def calculate_qed_for_poses_all(base_dir, dataset_name):
"""
Calculate QED values for all SDF files in poses_all directory
Args:
base_dir (Path): Base directory containing the poses_all folder
dataset_name (str): Name of the dataset (fgbar or trpe)
Returns:
list: List of dictionaries with smiles, filename, qed, and molecular weight values
"""
results = []
poses_all_dir = base_dir / dataset_name / "poses_all"
if not poses_all_dir.exists():
logger.warning(f"Directory {poses_all_dir} does not exist")
return results
sdf_files = list(poses_all_dir.glob("*.sdf"))
logger.info(f"Processing {len(sdf_files)} SDF files in {poses_all_dir}")
for sdf_file in sdf_files:
smiles = get_smiles_from_sdf(sdf_file)
vina_scores = extract_vina_scores_from_sdf(sdf_file)
if smiles is not None:
try:
mol = Chem.MolFromSmiles(smiles)
if mol is not None:
qed_value = QED.qed(mol)
mol_weight = MolWt(mol)
results.append({
'smiles': smiles,
'filename': sdf_file.name,
'qed': qed_value,
'molecular_weight': mol_weight,
'vina_scores': vina_scores # 添加Vina得分列表
})
except Exception as e:
logger.warning(f"Failed to calculate QED for {sdf_file}: {e}")
return results
def calculate_qed_for_reference(base_dir, dataset_name):
"""
Calculate QED values for reference molecules in SDF format
Args:
base_dir (Path): Base directory containing the reference folder
dataset_name (str): Name of the dataset (fgbar or trpe)
Returns:
list: List of dictionaries with smiles, filename, qed, molecular weight, and vina scores values
"""
results = []
# 使用原始目录名称 "refence"
reference_dir = base_dir / "refence" / dataset_name
if not reference_dir.exists():
logger.warning(f"Directory {reference_dir} does not exist")
return results
# 查找参考分子的SDF文件
reference_sdf_files = list(reference_dir.glob("*_out_converted.sdf"))
logger.info(f"Processing {len(reference_sdf_files)} reference SDF files in {reference_dir}")
for sdf_file in reference_sdf_files:
smiles = get_smiles_from_sdf(sdf_file)
vina_scores = extract_vina_scores_from_sdf(sdf_file)
if smiles is not None:
try:
mol = Chem.MolFromSmiles(smiles)
if mol is not None:
qed_value = QED.qed(mol)
mol_weight = MolWt(mol)
results.append({
'smiles': smiles,
'filename': sdf_file.name,
'qed': qed_value,
'molecular_weight': mol_weight,
'vina_scores': str(vina_scores) # 添加Vina得分列表
})
except Exception as e:
logger.warning(f"Failed to calculate QED for {sdf_file}: {e}")
return results
def process_dataset(result_dir, dataset_name):
"""
Process a single dataset (fgbar or trpe) and save to a separate CSV file
Args:
result_dir (Path): Base result directory
dataset_name (str): Name of the dataset (fgbar or trpe)
"""
# Process poses_all SDF files
poses_results = calculate_qed_for_poses_all(result_dir, dataset_name)
# Process reference SDF files
reference_results = calculate_qed_for_reference(result_dir, dataset_name)
# Combine results
all_results = poses_results + reference_results
# Create DataFrame and save to CSV
if all_results:
df = pd.DataFrame(all_results)
csv_filename = f"qed_values_{dataset_name}.csv"
df.to_csv(csv_filename, index=False)
logger.info(f"Saved {len(df)} QED values to {csv_filename}")
print(f"First few rows of {csv_filename}:")
print(df.head())
else:
logger.warning(f"No QED values were calculated for {dataset_name}")
# Create empty CSV with headers
df = pd.DataFrame(columns=['smiles', 'filename', 'qed', 'molecular_weight', 'vina_scores'])
csv_filename = f"qed_values_{dataset_name}.csv"
df.to_csv(csv_filename, index=False)
def main():
"""
Main function to calculate QED values for all molecules
"""
# Define base directories
result_dir = Path("result")
# Process both datasets (fgbar and trpe) separately
datasets = ["fgbar", "trpe"]
for dataset in datasets:
process_dataset(result_dir, dataset)
if __name__ == "__main__":
main()