update
This commit is contained in:
110
MIC/qsar_1D.py
Normal file
110
MIC/qsar_1D.py
Normal file
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
'''
|
||||
@file :qsar_1d.py
|
||||
@Description: : 1D-QSAR modeling with SMILES and MIC data
|
||||
@Date :2024/09/28
|
||||
@Author :lyzeng
|
||||
@Email :pylyzeng@gmail.com
|
||||
@version :1.0
|
||||
'''
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from rdkit import Chem
|
||||
from rdkit.Chem import Descriptors
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.linear_model import LinearRegression, SGDRegressor
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.neighbors import KNeighborsRegressor
|
||||
from sklearn.tree import DecisionTreeRegressor
|
||||
from sklearn.neural_network import MLPRegressor
|
||||
from xgboost import XGBRegressor
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
import matplotlib.pyplot as plt
|
||||
import joblib
|
||||
import os
|
||||
|
||||
# Function to calculate 1D-QSAR descriptors
|
||||
def calculate_1dqsar_repr(smiles):
|
||||
mol = Chem.MolFromSmiles(smiles)
|
||||
if mol is None:
|
||||
return None
|
||||
descriptors = [
|
||||
Descriptors.MolWt(mol),
|
||||
Descriptors.MolLogP(mol),
|
||||
Descriptors.NumHDonors(mol),
|
||||
Descriptors.NumHAcceptors(mol),
|
||||
Descriptors.TPSA(mol),
|
||||
Descriptors.NumRotatableBonds(mol),
|
||||
Descriptors.NumAromaticRings(mol),
|
||||
Descriptors.NumAliphaticRings(mol),
|
||||
Descriptors.NumSaturatedRings(mol),
|
||||
Descriptors.NumHeteroatoms(mol),
|
||||
Descriptors.NumValenceElectrons(mol),
|
||||
Descriptors.NumRadicalElectrons(mol),
|
||||
Descriptors.qed(mol)
|
||||
]
|
||||
return descriptors
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Load your dataset
|
||||
df = pd.read_csv('/mnt/c/project/qsar/data/A_85.csv', sep=';')
|
||||
df_with_MIC = df[df['Standard Type'] == 'MIC']
|
||||
|
||||
# Select relevant columns
|
||||
qsar_df = df_with_MIC[['Smiles', 'Standard Value']].copy()
|
||||
qsar_df.rename(columns={'Smiles': 'SMILES', 'Standard Value': 'TARGET'}, inplace=True)
|
||||
|
||||
# Calculate 1D-QSAR descriptors for each molecule
|
||||
qsar_df['1dqsar_mr'] = qsar_df['SMILES'].apply(calculate_1dqsar_repr)
|
||||
|
||||
# Remove rows where descriptor calculation failed
|
||||
qsar_df = qsar_df.dropna()
|
||||
|
||||
# Split the data into training and testing sets
|
||||
train_df, test_df = train_test_split(qsar_df, test_size=0.2, random_state=42)
|
||||
|
||||
# Convert the data to NumPy arrays
|
||||
train_x = np.array(train_df['1dqsar_mr'].tolist())
|
||||
train_y = np.array(train_df['TARGET'].tolist())
|
||||
test_x = np.array(test_df['1dqsar_mr'].tolist())
|
||||
test_y = np.array(test_df['TARGET'].tolist())
|
||||
|
||||
# Define regressors to use
|
||||
regressors = [
|
||||
("Linear Regression", LinearRegression()),
|
||||
("Stochastic Gradient Descent", SGDRegressor(random_state=42)),
|
||||
("K-Nearest Neighbors", KNeighborsRegressor()),
|
||||
("Decision Tree", DecisionTreeRegressor(random_state=42)),
|
||||
("Random Forest", RandomForestRegressor(random_state=42)),
|
||||
("XGBoost", XGBRegressor(eval_metric="rmse", random_state=42)),
|
||||
("Multi-layer Perceptron", MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', max_iter=10000, random_state=42)),
|
||||
]
|
||||
|
||||
# Initialize results dictionary
|
||||
results = {}
|
||||
|
||||
# Train and evaluate each regressor
|
||||
for name, regressor in regressors:
|
||||
regressor.fit(train_x, train_y)
|
||||
pred_y = regressor.predict(test_x)
|
||||
mse = mean_squared_error(test_y, pred_y)
|
||||
r2 = r2_score(test_y, pred_y)
|
||||
results[f"1D-QSAR-{name}"] = {"MSE": mse, "R2": r2}
|
||||
print(f"[1D-QSAR][{name}]\tMSE:{mse:.4f}\tR2:{r2:.4f}")
|
||||
|
||||
# Save the trained model
|
||||
model_filename = f"1d_qsar_{name.replace(' ', '_').lower()}_model.pkl"
|
||||
joblib.dump(regressor, os.path.join(os.getcwd(), model_filename))
|
||||
print(f"Model saved to {model_filename}")
|
||||
|
||||
# Sort results by R2 score
|
||||
sorted_results = sorted(results.items(), key=lambda x: x[1]["R2"], reverse=True)
|
||||
|
||||
# Plot results (optional, for example purposes)
|
||||
plt.figure(figsize=(10, 6), dpi=300)
|
||||
plt.title("R2 Scores for Regressors")
|
||||
plt.barh([name for name, _ in sorted_results], [result['R2'] for _, result in sorted_results])
|
||||
plt.xlabel("R2 Score")
|
||||
plt.savefig('qsar_1D_r2_scores.png', dpi=300)
|
||||
Reference in New Issue
Block a user