#!/usr/bin/env python # -*- encoding: utf-8 -*- ''' @file :calculate_qed_values.py @Description :Calculate QED values for molecules in poses_all directories and reference molecules @Date :2025/08/04 @Author :lyzeng ''' import pandas as pd from rdkit import Chem from rdkit.Chem import QED from rdkit.Chem.Descriptors import MolWt from pathlib import Path import logging import json # Setup logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def get_smiles_from_sdf(sdf_file_path): """ Extract SMILES from the first molecule in an SDF file Args: sdf_file_path (Path): Path to the SDF file Returns: str: SMILES representation of the first molecule or None if failed """ try: supplier = Chem.SDMolSupplier(str(sdf_file_path)) mol = next(supplier) # Get the first molecule if mol is not None: smiles = Chem.MolToSmiles(mol) return smiles except Exception as e: logger.warning(f"Failed to extract SMILES from {sdf_file_path}: {e}") return None def get_smiles_from_mol2(mol2_file_path): """ Extract SMILES from a mol2 file Args: mol2_file_path (Path): Path to the mol2 file Returns: str: SMILES representation of the molecule or None if failed """ try: mol = Chem.MolFromMol2File(str(mol2_file_path)) if mol is not None: smiles = Chem.MolToSmiles(mol) return smiles except Exception as e: logger.warning(f"Failed to extract SMILES from {mol2_file_path}: {e}") return None def extract_vina_scores_from_sdf(sdf_file_path): """ Extract Vina scores from all conformers in an SDF file Args: sdf_file_path (Path): Path to the SDF file Returns: list: List of Vina scores (free_energy values) or empty list if failed """ scores = [] try: supplier = Chem.SDMolSupplier(str(sdf_file_path), removeHs=False) for mol in supplier: if mol is None: continue # Get the meeko property which contains docking information if mol.HasProp("meeko"): meeko_raw = mol.GetProp("meeko") try: meeko_dict = json.loads(meeko_raw) # Extract free energy (Vina score) if 'free_energy' in meeko_dict: scores.append(meeko_dict['free_energy']) except json.JSONDecodeError: logger.warning(f"Failed to parse meeko JSON for {sdf_file_path}") else: logger.warning(f"No meeko property found in molecule from {sdf_file_path}") except Exception as e: logger.warning(f"Failed to extract Vina scores from {sdf_file_path}: {e}") return scores def calculate_qed_for_poses_all(base_dir, dataset_name): """ Calculate QED values for all SDF files in poses_all directory Args: base_dir (Path): Base directory containing the poses_all folder dataset_name (str): Name of the dataset (fgbar or trpe) Returns: list: List of dictionaries with smiles, filename, qed, and molecular weight values """ results = [] poses_all_dir = base_dir / dataset_name / "poses_all" if not poses_all_dir.exists(): logger.warning(f"Directory {poses_all_dir} does not exist") return results sdf_files = list(poses_all_dir.glob("*.sdf")) logger.info(f"Processing {len(sdf_files)} SDF files in {poses_all_dir}") for sdf_file in sdf_files: smiles = get_smiles_from_sdf(sdf_file) vina_scores = extract_vina_scores_from_sdf(sdf_file) if smiles is not None: try: mol = Chem.MolFromSmiles(smiles) if mol is not None: qed_value = QED.qed(mol) mol_weight = MolWt(mol) results.append({ 'smiles': smiles, 'filename': sdf_file.name, 'qed': qed_value, 'molecular_weight': mol_weight, 'vina_scores': vina_scores # 添加Vina得分列表 }) except Exception as e: logger.warning(f"Failed to calculate QED for {sdf_file}: {e}") return results def calculate_qed_for_reference(base_dir, dataset_name): """ Calculate QED values for reference molecules in SDF format Args: base_dir (Path): Base directory containing the reference folder dataset_name (str): Name of the dataset (fgbar or trpe) Returns: list: List of dictionaries with smiles, filename, qed, molecular weight, and vina scores values """ results = [] # 使用原始目录名称 "refence" reference_dir = base_dir / "refence" / dataset_name if not reference_dir.exists(): logger.warning(f"Directory {reference_dir} does not exist") return results # 查找参考分子的SDF文件 reference_sdf_files = list(reference_dir.glob("*_out_converted.sdf")) logger.info(f"Processing {len(reference_sdf_files)} reference SDF files in {reference_dir}") for sdf_file in reference_sdf_files: smiles = get_smiles_from_sdf(sdf_file) vina_scores = extract_vina_scores_from_sdf(sdf_file) if smiles is not None: try: mol = Chem.MolFromSmiles(smiles) if mol is not None: qed_value = QED.qed(mol) mol_weight = MolWt(mol) results.append({ 'smiles': smiles, 'filename': sdf_file.name, 'qed': qed_value, 'molecular_weight': mol_weight, 'vina_scores': str(vina_scores) # 添加Vina得分列表 }) except Exception as e: logger.warning(f"Failed to calculate QED for {sdf_file}: {e}") return results def process_dataset(result_dir, dataset_name): """ Process a single dataset (fgbar or trpe) and save to a separate CSV file Args: result_dir (Path): Base result directory dataset_name (str): Name of the dataset (fgbar or trpe) """ # Process poses_all SDF files poses_results = calculate_qed_for_poses_all(result_dir, dataset_name) # Process reference SDF files reference_results = calculate_qed_for_reference(result_dir, dataset_name) # Combine results all_results = poses_results + reference_results # Create DataFrame and save to CSV if all_results: df = pd.DataFrame(all_results) csv_filename = f"qed_values_{dataset_name}.csv" df.to_csv(csv_filename, index=False) logger.info(f"Saved {len(df)} QED values to {csv_filename}") print(f"First few rows of {csv_filename}:") print(df.head()) else: logger.warning(f"No QED values were calculated for {dataset_name}") # Create empty CSV with headers df = pd.DataFrame(columns=['smiles', 'filename', 'qed', 'molecular_weight', 'vina_scores']) csv_filename = f"qed_values_{dataset_name}.csv" df.to_csv(csv_filename, index=False) def main(): """ Main function to calculate QED values for all molecules """ # Define base directories result_dir = Path("result") # Process both datasets (fgbar and trpe) separately datasets = ["fgbar", "trpe"] for dataset in datasets: process_dataset(result_dir, dataset) if __name__ == "__main__": main()