228 lines
7.7 KiB
Python
228 lines
7.7 KiB
Python
#!/usr/bin/env python
|
|
# -*- encoding: utf-8 -*-
|
|
'''
|
|
@file :calculate_qed_values.py
|
|
@Description :Calculate QED values for molecules in poses_all directories and reference molecules
|
|
@Date :2025/08/04
|
|
@Author :lyzeng
|
|
'''
|
|
|
|
import pandas as pd
|
|
from rdkit import Chem
|
|
from rdkit.Chem import QED
|
|
from rdkit.Chem.Descriptors import MolWt
|
|
from pathlib import Path
|
|
import logging
|
|
import json
|
|
|
|
# Setup logging
|
|
logging.basicConfig(level=logging.INFO)
|
|
logger = logging.getLogger(__name__)
|
|
|
|
def get_smiles_from_sdf(sdf_file_path):
|
|
"""
|
|
Extract SMILES from the first molecule in an SDF file
|
|
|
|
Args:
|
|
sdf_file_path (Path): Path to the SDF file
|
|
|
|
Returns:
|
|
str: SMILES representation of the first molecule or None if failed
|
|
"""
|
|
try:
|
|
supplier = Chem.SDMolSupplier(str(sdf_file_path))
|
|
mol = next(supplier) # Get the first molecule
|
|
if mol is not None:
|
|
smiles = Chem.MolToSmiles(mol)
|
|
return smiles
|
|
except Exception as e:
|
|
logger.warning(f"Failed to extract SMILES from {sdf_file_path}: {e}")
|
|
return None
|
|
|
|
def get_smiles_from_mol2(mol2_file_path):
|
|
"""
|
|
Extract SMILES from a mol2 file
|
|
|
|
Args:
|
|
mol2_file_path (Path): Path to the mol2 file
|
|
|
|
Returns:
|
|
str: SMILES representation of the molecule or None if failed
|
|
"""
|
|
try:
|
|
mol = Chem.MolFromMol2File(str(mol2_file_path))
|
|
if mol is not None:
|
|
smiles = Chem.MolToSmiles(mol)
|
|
return smiles
|
|
except Exception as e:
|
|
logger.warning(f"Failed to extract SMILES from {mol2_file_path}: {e}")
|
|
return None
|
|
|
|
def extract_vina_scores_from_sdf(sdf_file_path):
|
|
"""
|
|
Extract Vina scores from all conformers in an SDF file
|
|
|
|
Args:
|
|
sdf_file_path (Path): Path to the SDF file
|
|
|
|
Returns:
|
|
list: List of Vina scores (free_energy values) or empty list if failed
|
|
"""
|
|
scores = []
|
|
try:
|
|
supplier = Chem.SDMolSupplier(str(sdf_file_path), removeHs=False)
|
|
for mol in supplier:
|
|
if mol is None:
|
|
continue
|
|
|
|
# Get the meeko property which contains docking information
|
|
if mol.HasProp("meeko"):
|
|
meeko_raw = mol.GetProp("meeko")
|
|
try:
|
|
meeko_dict = json.loads(meeko_raw)
|
|
# Extract free energy (Vina score)
|
|
if 'free_energy' in meeko_dict:
|
|
scores.append(meeko_dict['free_energy'])
|
|
except json.JSONDecodeError:
|
|
logger.warning(f"Failed to parse meeko JSON for {sdf_file_path}")
|
|
else:
|
|
logger.warning(f"No meeko property found in molecule from {sdf_file_path}")
|
|
except Exception as e:
|
|
logger.warning(f"Failed to extract Vina scores from {sdf_file_path}: {e}")
|
|
|
|
return scores
|
|
|
|
def calculate_qed_for_poses_all(base_dir, dataset_name):
|
|
"""
|
|
Calculate QED values for all SDF files in poses_all directory
|
|
|
|
Args:
|
|
base_dir (Path): Base directory containing the poses_all folder
|
|
dataset_name (str): Name of the dataset (fgbar or trpe)
|
|
|
|
Returns:
|
|
list: List of dictionaries with smiles, filename, qed, and molecular weight values
|
|
"""
|
|
results = []
|
|
poses_all_dir = base_dir / dataset_name / "poses_all"
|
|
|
|
if not poses_all_dir.exists():
|
|
logger.warning(f"Directory {poses_all_dir} does not exist")
|
|
return results
|
|
|
|
sdf_files = list(poses_all_dir.glob("*.sdf"))
|
|
logger.info(f"Processing {len(sdf_files)} SDF files in {poses_all_dir}")
|
|
|
|
for sdf_file in sdf_files:
|
|
smiles = get_smiles_from_sdf(sdf_file)
|
|
vina_scores = extract_vina_scores_from_sdf(sdf_file)
|
|
|
|
if smiles is not None:
|
|
try:
|
|
mol = Chem.MolFromSmiles(smiles)
|
|
if mol is not None:
|
|
qed_value = QED.qed(mol)
|
|
mol_weight = MolWt(mol)
|
|
results.append({
|
|
'smiles': smiles,
|
|
'filename': sdf_file.name,
|
|
'qed': qed_value,
|
|
'molecular_weight': mol_weight,
|
|
'vina_scores': vina_scores # 添加Vina得分列表
|
|
})
|
|
except Exception as e:
|
|
logger.warning(f"Failed to calculate QED for {sdf_file}: {e}")
|
|
|
|
return results
|
|
|
|
def calculate_qed_for_reference(base_dir, dataset_name):
|
|
"""
|
|
Calculate QED values for reference molecules in SDF format
|
|
|
|
Args:
|
|
base_dir (Path): Base directory containing the reference folder
|
|
dataset_name (str): Name of the dataset (fgbar or trpe)
|
|
|
|
Returns:
|
|
list: List of dictionaries with smiles, filename, qed, molecular weight, and vina scores values
|
|
"""
|
|
results = []
|
|
# 使用原始目录名称 "refence"
|
|
reference_dir = base_dir / "refence" / dataset_name
|
|
|
|
if not reference_dir.exists():
|
|
logger.warning(f"Directory {reference_dir} does not exist")
|
|
return results
|
|
|
|
# 查找参考分子的SDF文件
|
|
reference_sdf_files = list(reference_dir.glob("*_out_converted.sdf"))
|
|
logger.info(f"Processing {len(reference_sdf_files)} reference SDF files in {reference_dir}")
|
|
|
|
for sdf_file in reference_sdf_files:
|
|
smiles = get_smiles_from_sdf(sdf_file)
|
|
vina_scores = extract_vina_scores_from_sdf(sdf_file)
|
|
|
|
if smiles is not None:
|
|
try:
|
|
mol = Chem.MolFromSmiles(smiles)
|
|
if mol is not None:
|
|
qed_value = QED.qed(mol)
|
|
mol_weight = MolWt(mol)
|
|
results.append({
|
|
'smiles': smiles,
|
|
'filename': sdf_file.name,
|
|
'qed': qed_value,
|
|
'molecular_weight': mol_weight,
|
|
'vina_scores': str(vina_scores) # 添加Vina得分列表
|
|
})
|
|
except Exception as e:
|
|
logger.warning(f"Failed to calculate QED for {sdf_file}: {e}")
|
|
|
|
return results
|
|
|
|
def process_dataset(result_dir, dataset_name):
|
|
"""
|
|
Process a single dataset (fgbar or trpe) and save to a separate CSV file
|
|
|
|
Args:
|
|
result_dir (Path): Base result directory
|
|
dataset_name (str): Name of the dataset (fgbar or trpe)
|
|
"""
|
|
# Process poses_all SDF files
|
|
poses_results = calculate_qed_for_poses_all(result_dir, dataset_name)
|
|
|
|
# Process reference SDF files
|
|
reference_results = calculate_qed_for_reference(result_dir, dataset_name)
|
|
|
|
# Combine results
|
|
all_results = poses_results + reference_results
|
|
|
|
# Create DataFrame and save to CSV
|
|
if all_results:
|
|
df = pd.DataFrame(all_results)
|
|
csv_filename = f"qed_values_{dataset_name}.csv"
|
|
df.to_csv(csv_filename, index=False)
|
|
logger.info(f"Saved {len(df)} QED values to {csv_filename}")
|
|
print(f"First few rows of {csv_filename}:")
|
|
print(df.head())
|
|
else:
|
|
logger.warning(f"No QED values were calculated for {dataset_name}")
|
|
# Create empty CSV with headers
|
|
df = pd.DataFrame(columns=['smiles', 'filename', 'qed', 'molecular_weight', 'vina_scores'])
|
|
csv_filename = f"qed_values_{dataset_name}.csv"
|
|
df.to_csv(csv_filename, index=False)
|
|
|
|
def main():
|
|
"""
|
|
Main function to calculate QED values for all molecules
|
|
"""
|
|
# Define base directories
|
|
result_dir = Path("result")
|
|
|
|
# Process both datasets (fgbar and trpe) separately
|
|
datasets = ["fgbar", "trpe"]
|
|
for dataset in datasets:
|
|
process_dataset(result_dir, dataset)
|
|
|
|
if __name__ == "__main__":
|
|
main() |