update
This commit is contained in:
118
MIC/qsar_3D_predict.py
Normal file
118
MIC/qsar_3D_predict.py
Normal file
@@ -0,0 +1,118 @@
|
||||
import joblib
|
||||
import numpy as np
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import AllChem
|
||||
from rdkit.Chem import rdPartialCharges
|
||||
from pathlib import Path
|
||||
from pprint import pprint
|
||||
|
||||
# Function to calculate 3D-QSAR descriptors using Coulomb Matrix
|
||||
def calculate_3dqsar_repr(SMILES, max_atoms=100, three_d=False):
|
||||
mol = Chem.MolFromSmiles(SMILES)
|
||||
mol = Chem.AddHs(mol)
|
||||
if three_d:
|
||||
AllChem.EmbedMolecule(mol, AllChem.ETKDG())
|
||||
else:
|
||||
AllChem.Compute2DCoords(mol)
|
||||
natoms = mol.GetNumAtoms()
|
||||
rdPartialCharges.ComputeGasteigerCharges(mol)
|
||||
charges = np.array([float(atom.GetProp("_GasteigerCharge")) for atom in mol.GetAtoms()])
|
||||
coords = mol.GetConformer().GetPositions()
|
||||
coulomb_matrix = np.zeros((max_atoms, max_atoms))
|
||||
n = min(max_atoms, natoms)
|
||||
for i in range(n):
|
||||
for j in range(i, n):
|
||||
if i == j:
|
||||
coulomb_matrix[i, j] = 0.5 * charges[i] ** 2
|
||||
if i != j:
|
||||
delta = np.linalg.norm(coords[i] - coords[j])
|
||||
if delta != 0:
|
||||
coulomb_matrix[i, j] = charges[i] * charges[j] / delta
|
||||
coulomb_matrix[j, i] = coulomb_matrix[i, j]
|
||||
coulomb_matrix = np.where(np.isinf(coulomb_matrix), 0, coulomb_matrix)
|
||||
coulomb_matrix = np.where(np.isnan(coulomb_matrix), 0, coulomb_matrix)
|
||||
return coulomb_matrix.reshape(max_atoms*max_atoms).tolist()
|
||||
|
||||
# # Load the SDF file and convert to SMILES
|
||||
# sdf_file = '/mnt/c/project/qsar/predict_data/chem1.sdf'
|
||||
# supplier = Chem.SDMolSupplier(sdf_file)
|
||||
# new_mol = [mol for mol in supplier][0] # Assuming only one molecule in SDF
|
||||
# smiles = Chem.MolToSmiles(new_mol)
|
||||
|
||||
# # Calculate the 3D-QSAR descriptors
|
||||
# descriptor = calculate_3dqsar_repr(smiles)
|
||||
# descriptor_array = np.array(descriptor).reshape(1, -1)
|
||||
|
||||
# # Load the saved model (use the model that performed best in training)
|
||||
# model_filename = "3d_qsar_random_forest_model.pkl"
|
||||
# model = joblib.load(model_filename)
|
||||
|
||||
# # Predict the MIC value
|
||||
# predicted_mic = model.predict(descriptor_array)
|
||||
# print(f"Predicted MIC value: {predicted_mic[0]}")
|
||||
|
||||
# Load the SDF file and convert to SMILES
|
||||
sdf_file_list = [i for i in Path('../predict_data').glob('*.sdf')]
|
||||
# sdf_file = '/mnt/c/project/qsar/predict_data/chem1.sdf'
|
||||
sdf_results = {}
|
||||
for sdf_file in sdf_file_list:
|
||||
supplier = Chem.SDMolSupplier(sdf_file)
|
||||
new_mol = [mol for mol in supplier][0] # Assuming only one molecule in SDF
|
||||
smiles = Chem.MolToSmiles(new_mol)
|
||||
|
||||
# Calculate the 3D-QSAR descriptors
|
||||
descriptor = calculate_3dqsar_repr(smiles)
|
||||
descriptor_array = np.array(descriptor).reshape(1, -1)
|
||||
|
||||
# Load the saved model (use the model that performed best in training)
|
||||
model_file_list = [i for i in Path().cwd().glob('3d_qsar_*.pkl')]
|
||||
|
||||
results = {}
|
||||
|
||||
for model_file in model_file_list:
|
||||
model = joblib.load(model_file)
|
||||
# Predict the MIC value
|
||||
predicted_mic = model.predict(descriptor_array)
|
||||
# print(f"Predicted MIC value: {predicted_mic[0]}")
|
||||
results[model_file.stem] = predicted_mic[0]
|
||||
|
||||
sdf_results[sdf_file.stem] = results
|
||||
|
||||
pprint(sdf_results)
|
||||
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import pandas as pd
|
||||
|
||||
# Filter out negative MIC values from sdf_results
|
||||
filtered_sdf_results = {}
|
||||
for sdf_name, model_results in sdf_results.items():
|
||||
filtered_results = {model_name: mic_value for model_name, mic_value in model_results.items() if mic_value >= 0}
|
||||
filtered_sdf_results[sdf_name] = filtered_results
|
||||
|
||||
pprint(filtered_sdf_results)
|
||||
|
||||
# Convert the filtered results to a DataFrame for easier plotting
|
||||
filtered_data = []
|
||||
for sdf_name, model_results in filtered_sdf_results.items():
|
||||
for model_name, mic_value in model_results.items():
|
||||
filtered_data.append({'SDF': sdf_name, 'Model': model_name, 'MIC': mic_value})
|
||||
|
||||
df = pd.DataFrame(filtered_data)
|
||||
|
||||
# Set up the matplotlib figure
|
||||
plt.figure(figsize=(12, 8))
|
||||
|
||||
# Create a seaborn barplot with the filtered data
|
||||
sns.barplot(x='SDF', y='MIC', hue='Model', data=df, palette='tab20')
|
||||
|
||||
# Customize the plot
|
||||
plt.title('Predicted MIC values by Model for Each SDF (Filtered)')
|
||||
plt.xlabel('SDF Files')
|
||||
plt.ylabel('Predicted MIC Values')
|
||||
plt.xticks(rotation=45)
|
||||
plt.legend(title='Model')
|
||||
|
||||
# Show the plot
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
Reference in New Issue
Block a user