Files
qsar/MIC/qsar_3D_predict.py
2024-09-28 13:14:52 +08:00

119 lines
4.2 KiB
Python

import joblib
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdPartialCharges
from pathlib import Path
from pprint import pprint
# Function to calculate 3D-QSAR descriptors using Coulomb Matrix
def calculate_3dqsar_repr(SMILES, max_atoms=100, three_d=False):
mol = Chem.MolFromSmiles(SMILES)
mol = Chem.AddHs(mol)
if three_d:
AllChem.EmbedMolecule(mol, AllChem.ETKDG())
else:
AllChem.Compute2DCoords(mol)
natoms = mol.GetNumAtoms()
rdPartialCharges.ComputeGasteigerCharges(mol)
charges = np.array([float(atom.GetProp("_GasteigerCharge")) for atom in mol.GetAtoms()])
coords = mol.GetConformer().GetPositions()
coulomb_matrix = np.zeros((max_atoms, max_atoms))
n = min(max_atoms, natoms)
for i in range(n):
for j in range(i, n):
if i == j:
coulomb_matrix[i, j] = 0.5 * charges[i] ** 2
if i != j:
delta = np.linalg.norm(coords[i] - coords[j])
if delta != 0:
coulomb_matrix[i, j] = charges[i] * charges[j] / delta
coulomb_matrix[j, i] = coulomb_matrix[i, j]
coulomb_matrix = np.where(np.isinf(coulomb_matrix), 0, coulomb_matrix)
coulomb_matrix = np.where(np.isnan(coulomb_matrix), 0, coulomb_matrix)
return coulomb_matrix.reshape(max_atoms*max_atoms).tolist()
# # Load the SDF file and convert to SMILES
# sdf_file = '/mnt/c/project/qsar/predict_data/chem1.sdf'
# supplier = Chem.SDMolSupplier(sdf_file)
# new_mol = [mol for mol in supplier][0] # Assuming only one molecule in SDF
# smiles = Chem.MolToSmiles(new_mol)
# # Calculate the 3D-QSAR descriptors
# descriptor = calculate_3dqsar_repr(smiles)
# descriptor_array = np.array(descriptor).reshape(1, -1)
# # Load the saved model (use the model that performed best in training)
# model_filename = "3d_qsar_random_forest_model.pkl"
# model = joblib.load(model_filename)
# # Predict the MIC value
# predicted_mic = model.predict(descriptor_array)
# print(f"Predicted MIC value: {predicted_mic[0]}")
# Load the SDF file and convert to SMILES
sdf_file_list = [i for i in Path('../predict_data').glob('*.sdf')]
# sdf_file = '/mnt/c/project/qsar/predict_data/chem1.sdf'
sdf_results = {}
for sdf_file in sdf_file_list:
supplier = Chem.SDMolSupplier(sdf_file)
new_mol = [mol for mol in supplier][0] # Assuming only one molecule in SDF
smiles = Chem.MolToSmiles(new_mol)
# Calculate the 3D-QSAR descriptors
descriptor = calculate_3dqsar_repr(smiles)
descriptor_array = np.array(descriptor).reshape(1, -1)
# Load the saved model (use the model that performed best in training)
model_file_list = [i for i in Path().cwd().glob('3d_qsar_*.pkl')]
results = {}
for model_file in model_file_list:
model = joblib.load(model_file)
# Predict the MIC value
predicted_mic = model.predict(descriptor_array)
# print(f"Predicted MIC value: {predicted_mic[0]}")
results[model_file.stem] = predicted_mic[0]
sdf_results[sdf_file.stem] = results
pprint(sdf_results)
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Filter out negative MIC values from sdf_results
filtered_sdf_results = {}
for sdf_name, model_results in sdf_results.items():
filtered_results = {model_name: mic_value for model_name, mic_value in model_results.items() if mic_value >= 0}
filtered_sdf_results[sdf_name] = filtered_results
pprint(filtered_sdf_results)
# Convert the filtered results to a DataFrame for easier plotting
filtered_data = []
for sdf_name, model_results in filtered_sdf_results.items():
for model_name, mic_value in model_results.items():
filtered_data.append({'SDF': sdf_name, 'Model': model_name, 'MIC': mic_value})
df = pd.DataFrame(filtered_data)
# Set up the matplotlib figure
plt.figure(figsize=(12, 8))
# Create a seaborn barplot with the filtered data
sns.barplot(x='SDF', y='MIC', hue='Model', data=df, palette='tab20')
# Customize the plot
plt.title('Predicted MIC values by Model for Each SDF (Filtered)')
plt.xlabel('SDF Files')
plt.ylabel('Predicted MIC Values')
plt.xticks(rotation=45)
plt.legend(title='Model')
# Show the plot
plt.tight_layout()
plt.show()