update
This commit is contained in:
228
scripts/calculate_qed_values.py
Normal file
228
scripts/calculate_qed_values.py
Normal file
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
'''
|
||||
@file :calculate_qed_values.py
|
||||
@Description :Calculate QED values for molecules in poses_all directories and reference molecules
|
||||
@Date :2025/08/04
|
||||
@Author :lyzeng
|
||||
'''
|
||||
|
||||
import pandas as pd
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import QED
|
||||
from rdkit.Chem.Descriptors import MolWt
|
||||
from pathlib import Path
|
||||
import logging
|
||||
import json
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def get_smiles_from_sdf(sdf_file_path):
|
||||
"""
|
||||
Extract SMILES from the first molecule in an SDF file
|
||||
|
||||
Args:
|
||||
sdf_file_path (Path): Path to the SDF file
|
||||
|
||||
Returns:
|
||||
str: SMILES representation of the first molecule or None if failed
|
||||
"""
|
||||
try:
|
||||
supplier = Chem.SDMolSupplier(str(sdf_file_path))
|
||||
mol = next(supplier) # Get the first molecule
|
||||
if mol is not None:
|
||||
smiles = Chem.MolToSmiles(mol)
|
||||
return smiles
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract SMILES from {sdf_file_path}: {e}")
|
||||
return None
|
||||
|
||||
def get_smiles_from_mol2(mol2_file_path):
|
||||
"""
|
||||
Extract SMILES from a mol2 file
|
||||
|
||||
Args:
|
||||
mol2_file_path (Path): Path to the mol2 file
|
||||
|
||||
Returns:
|
||||
str: SMILES representation of the molecule or None if failed
|
||||
"""
|
||||
try:
|
||||
mol = Chem.MolFromMol2File(str(mol2_file_path))
|
||||
if mol is not None:
|
||||
smiles = Chem.MolToSmiles(mol)
|
||||
return smiles
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract SMILES from {mol2_file_path}: {e}")
|
||||
return None
|
||||
|
||||
def extract_vina_scores_from_sdf(sdf_file_path):
|
||||
"""
|
||||
Extract Vina scores from all conformers in an SDF file
|
||||
|
||||
Args:
|
||||
sdf_file_path (Path): Path to the SDF file
|
||||
|
||||
Returns:
|
||||
list: List of Vina scores (free_energy values) or empty list if failed
|
||||
"""
|
||||
scores = []
|
||||
try:
|
||||
supplier = Chem.SDMolSupplier(str(sdf_file_path), removeHs=False)
|
||||
for mol in supplier:
|
||||
if mol is None:
|
||||
continue
|
||||
|
||||
# Get the meeko property which contains docking information
|
||||
if mol.HasProp("meeko"):
|
||||
meeko_raw = mol.GetProp("meeko")
|
||||
try:
|
||||
meeko_dict = json.loads(meeko_raw)
|
||||
# Extract free energy (Vina score)
|
||||
if 'free_energy' in meeko_dict:
|
||||
scores.append(meeko_dict['free_energy'])
|
||||
except json.JSONDecodeError:
|
||||
logger.warning(f"Failed to parse meeko JSON for {sdf_file_path}")
|
||||
else:
|
||||
logger.warning(f"No meeko property found in molecule from {sdf_file_path}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract Vina scores from {sdf_file_path}: {e}")
|
||||
|
||||
return scores
|
||||
|
||||
def calculate_qed_for_poses_all(base_dir, dataset_name):
|
||||
"""
|
||||
Calculate QED values for all SDF files in poses_all directory
|
||||
|
||||
Args:
|
||||
base_dir (Path): Base directory containing the poses_all folder
|
||||
dataset_name (str): Name of the dataset (fgbar or trpe)
|
||||
|
||||
Returns:
|
||||
list: List of dictionaries with smiles, filename, qed, and molecular weight values
|
||||
"""
|
||||
results = []
|
||||
poses_all_dir = base_dir / dataset_name / "poses_all"
|
||||
|
||||
if not poses_all_dir.exists():
|
||||
logger.warning(f"Directory {poses_all_dir} does not exist")
|
||||
return results
|
||||
|
||||
sdf_files = list(poses_all_dir.glob("*.sdf"))
|
||||
logger.info(f"Processing {len(sdf_files)} SDF files in {poses_all_dir}")
|
||||
|
||||
for sdf_file in sdf_files:
|
||||
smiles = get_smiles_from_sdf(sdf_file)
|
||||
vina_scores = extract_vina_scores_from_sdf(sdf_file)
|
||||
|
||||
if smiles is not None:
|
||||
try:
|
||||
mol = Chem.MolFromSmiles(smiles)
|
||||
if mol is not None:
|
||||
qed_value = QED.qed(mol)
|
||||
mol_weight = MolWt(mol)
|
||||
results.append({
|
||||
'smiles': smiles,
|
||||
'filename': sdf_file.name,
|
||||
'qed': qed_value,
|
||||
'molecular_weight': mol_weight,
|
||||
'vina_scores': vina_scores # 添加Vina得分列表
|
||||
})
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to calculate QED for {sdf_file}: {e}")
|
||||
|
||||
return results
|
||||
|
||||
def calculate_qed_for_reference(base_dir, dataset_name):
|
||||
"""
|
||||
Calculate QED values for reference molecules in SDF format
|
||||
|
||||
Args:
|
||||
base_dir (Path): Base directory containing the reference folder
|
||||
dataset_name (str): Name of the dataset (fgbar or trpe)
|
||||
|
||||
Returns:
|
||||
list: List of dictionaries with smiles, filename, qed, molecular weight, and vina scores values
|
||||
"""
|
||||
results = []
|
||||
# 使用原始目录名称 "refence"
|
||||
reference_dir = base_dir / "refence" / dataset_name
|
||||
|
||||
if not reference_dir.exists():
|
||||
logger.warning(f"Directory {reference_dir} does not exist")
|
||||
return results
|
||||
|
||||
# 查找参考分子的SDF文件
|
||||
reference_sdf_files = list(reference_dir.glob("*_out_converted.sdf"))
|
||||
logger.info(f"Processing {len(reference_sdf_files)} reference SDF files in {reference_dir}")
|
||||
|
||||
for sdf_file in reference_sdf_files:
|
||||
smiles = get_smiles_from_sdf(sdf_file)
|
||||
vina_scores = extract_vina_scores_from_sdf(sdf_file)
|
||||
|
||||
if smiles is not None:
|
||||
try:
|
||||
mol = Chem.MolFromSmiles(smiles)
|
||||
if mol is not None:
|
||||
qed_value = QED.qed(mol)
|
||||
mol_weight = MolWt(mol)
|
||||
results.append({
|
||||
'smiles': smiles,
|
||||
'filename': sdf_file.name,
|
||||
'qed': qed_value,
|
||||
'molecular_weight': mol_weight,
|
||||
'vina_scores': str(vina_scores) # 添加Vina得分列表
|
||||
})
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to calculate QED for {sdf_file}: {e}")
|
||||
|
||||
return results
|
||||
|
||||
def process_dataset(result_dir, dataset_name):
|
||||
"""
|
||||
Process a single dataset (fgbar or trpe) and save to a separate CSV file
|
||||
|
||||
Args:
|
||||
result_dir (Path): Base result directory
|
||||
dataset_name (str): Name of the dataset (fgbar or trpe)
|
||||
"""
|
||||
# Process poses_all SDF files
|
||||
poses_results = calculate_qed_for_poses_all(result_dir, dataset_name)
|
||||
|
||||
# Process reference SDF files
|
||||
reference_results = calculate_qed_for_reference(result_dir, dataset_name)
|
||||
|
||||
# Combine results
|
||||
all_results = poses_results + reference_results
|
||||
|
||||
# Create DataFrame and save to CSV
|
||||
if all_results:
|
||||
df = pd.DataFrame(all_results)
|
||||
csv_filename = f"qed_values_{dataset_name}.csv"
|
||||
df.to_csv(csv_filename, index=False)
|
||||
logger.info(f"Saved {len(df)} QED values to {csv_filename}")
|
||||
print(f"First few rows of {csv_filename}:")
|
||||
print(df.head())
|
||||
else:
|
||||
logger.warning(f"No QED values were calculated for {dataset_name}")
|
||||
# Create empty CSV with headers
|
||||
df = pd.DataFrame(columns=['smiles', 'filename', 'qed', 'molecular_weight', 'vina_scores'])
|
||||
csv_filename = f"qed_values_{dataset_name}.csv"
|
||||
df.to_csv(csv_filename, index=False)
|
||||
|
||||
def main():
|
||||
"""
|
||||
Main function to calculate QED values for all molecules
|
||||
"""
|
||||
# Define base directories
|
||||
result_dir = Path("result")
|
||||
|
||||
# Process both datasets (fgbar and trpe) separately
|
||||
datasets = ["fgbar", "trpe"]
|
||||
for dataset in datasets:
|
||||
process_dataset(result_dir, dataset)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user