import joblib import numpy as np from rdkit import Chem from rdkit.Chem import AllChem from rdkit.Chem import rdPartialCharges from pathlib import Path from pprint import pprint # Function to calculate 3D-QSAR descriptors using Coulomb Matrix def calculate_3dqsar_repr(SMILES, max_atoms=100, three_d=False): mol = Chem.MolFromSmiles(SMILES) mol = Chem.AddHs(mol) if three_d: AllChem.EmbedMolecule(mol, AllChem.ETKDG()) else: AllChem.Compute2DCoords(mol) natoms = mol.GetNumAtoms() rdPartialCharges.ComputeGasteigerCharges(mol) charges = np.array([float(atom.GetProp("_GasteigerCharge")) for atom in mol.GetAtoms()]) coords = mol.GetConformer().GetPositions() coulomb_matrix = np.zeros((max_atoms, max_atoms)) n = min(max_atoms, natoms) for i in range(n): for j in range(i, n): if i == j: coulomb_matrix[i, j] = 0.5 * charges[i] ** 2 if i != j: delta = np.linalg.norm(coords[i] - coords[j]) if delta != 0: coulomb_matrix[i, j] = charges[i] * charges[j] / delta coulomb_matrix[j, i] = coulomb_matrix[i, j] coulomb_matrix = np.where(np.isinf(coulomb_matrix), 0, coulomb_matrix) coulomb_matrix = np.where(np.isnan(coulomb_matrix), 0, coulomb_matrix) return coulomb_matrix.reshape(max_atoms*max_atoms).tolist() # # Load the SDF file and convert to SMILES # sdf_file = '/mnt/c/project/qsar/predict_data/chem1.sdf' # supplier = Chem.SDMolSupplier(sdf_file) # new_mol = [mol for mol in supplier][0] # Assuming only one molecule in SDF # smiles = Chem.MolToSmiles(new_mol) # # Calculate the 3D-QSAR descriptors # descriptor = calculate_3dqsar_repr(smiles) # descriptor_array = np.array(descriptor).reshape(1, -1) # # Load the saved model (use the model that performed best in training) # model_filename = "3d_qsar_random_forest_model.pkl" # model = joblib.load(model_filename) # # Predict the MIC value # predicted_mic = model.predict(descriptor_array) # print(f"Predicted MIC value: {predicted_mic[0]}") # Load the SDF file and convert to SMILES sdf_file_list = [i for i in Path('../predict_data').glob('*.sdf')] # sdf_file = '/mnt/c/project/qsar/predict_data/chem1.sdf' sdf_results = {} for sdf_file in sdf_file_list: supplier = Chem.SDMolSupplier(sdf_file) new_mol = [mol for mol in supplier][0] # Assuming only one molecule in SDF smiles = Chem.MolToSmiles(new_mol) # Calculate the 3D-QSAR descriptors descriptor = calculate_3dqsar_repr(smiles) descriptor_array = np.array(descriptor).reshape(1, -1) # Load the saved model (use the model that performed best in training) model_file_list = [i for i in Path().cwd().glob('3d_qsar_*.pkl')] results = {} for model_file in model_file_list: model = joblib.load(model_file) # Predict the MIC value predicted_mic = model.predict(descriptor_array) # print(f"Predicted MIC value: {predicted_mic[0]}") results[model_file.stem] = predicted_mic[0] sdf_results[sdf_file.stem] = results pprint(sdf_results) import seaborn as sns import matplotlib.pyplot as plt import pandas as pd # Filter out negative MIC values from sdf_results filtered_sdf_results = {} for sdf_name, model_results in sdf_results.items(): filtered_results = {model_name: mic_value for model_name, mic_value in model_results.items() if mic_value >= 0} filtered_sdf_results[sdf_name] = filtered_results pprint(filtered_sdf_results) # Convert the filtered results to a DataFrame for easier plotting filtered_data = [] for sdf_name, model_results in filtered_sdf_results.items(): for model_name, mic_value in model_results.items(): filtered_data.append({'SDF': sdf_name, 'Model': model_name, 'MIC': mic_value}) df = pd.DataFrame(filtered_data) # Set up the matplotlib figure plt.figure(figsize=(12, 8)) # Create a seaborn barplot with the filtered data sns.barplot(x='SDF', y='MIC', hue='Model', data=df, palette='tab20') # Customize the plot plt.title('Predicted MIC values by Model for Each SDF (Filtered)') plt.xlabel('SDF Files') plt.ylabel('Predicted MIC Values') plt.xticks(rotation=45) plt.legend(title='Model') # Show the plot plt.tight_layout() plt.show()