#!/usr/bin/env python # -*- encoding: utf-8 -*- ''' @file :qsar_1d.py @Description: : 1D-QSAR modeling with SMILES and MIC data @Date :2024/09/28 @Author :lyzeng @Email :pylyzeng@gmail.com @version :1.0 ''' import pandas as pd import numpy as np from rdkit import Chem from rdkit.Chem import Descriptors from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression, SGDRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.neural_network import MLPRegressor from xgboost import XGBRegressor from sklearn.metrics import mean_squared_error, r2_score import matplotlib.pyplot as plt import joblib import os # Function to calculate 1D-QSAR descriptors def calculate_1dqsar_repr(smiles): mol = Chem.MolFromSmiles(smiles) if mol is None: return None descriptors = [ Descriptors.MolWt(mol), Descriptors.MolLogP(mol), Descriptors.NumHDonors(mol), Descriptors.NumHAcceptors(mol), Descriptors.TPSA(mol), Descriptors.NumRotatableBonds(mol), Descriptors.NumAromaticRings(mol), Descriptors.NumAliphaticRings(mol), Descriptors.NumSaturatedRings(mol), Descriptors.NumHeteroatoms(mol), Descriptors.NumValenceElectrons(mol), Descriptors.NumRadicalElectrons(mol), Descriptors.qed(mol) ] return descriptors if __name__ == '__main__': # Load your dataset df = pd.read_csv('/mnt/c/project/qsar/data/A_85.csv', sep=';') df_with_MIC = df[df['Standard Type'] == 'MIC'] # Select relevant columns qsar_df = df_with_MIC[['Smiles', 'Standard Value']].copy() qsar_df.rename(columns={'Smiles': 'SMILES', 'Standard Value': 'TARGET'}, inplace=True) # Calculate 1D-QSAR descriptors for each molecule qsar_df['1dqsar_mr'] = qsar_df['SMILES'].apply(calculate_1dqsar_repr) # Remove rows where descriptor calculation failed qsar_df = qsar_df.dropna() # Split the data into training and testing sets train_df, test_df = train_test_split(qsar_df, test_size=0.2, random_state=42) # Convert the data to NumPy arrays train_x = np.array(train_df['1dqsar_mr'].tolist()) train_y = np.array(train_df['TARGET'].tolist()) test_x = np.array(test_df['1dqsar_mr'].tolist()) test_y = np.array(test_df['TARGET'].tolist()) # Define regressors to use regressors = [ ("Linear Regression", LinearRegression()), ("Stochastic Gradient Descent", SGDRegressor(random_state=42)), ("K-Nearest Neighbors", KNeighborsRegressor()), ("Decision Tree", DecisionTreeRegressor(random_state=42)), ("Random Forest", RandomForestRegressor(random_state=42)), ("XGBoost", XGBRegressor(eval_metric="rmse", random_state=42)), ("Multi-layer Perceptron", MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', max_iter=10000, random_state=42)), ] # Initialize results dictionary results = {} # Train and evaluate each regressor for name, regressor in regressors: regressor.fit(train_x, train_y) pred_y = regressor.predict(test_x) mse = mean_squared_error(test_y, pred_y) r2 = r2_score(test_y, pred_y) results[f"1D-QSAR-{name}"] = {"MSE": mse, "R2": r2} print(f"[1D-QSAR][{name}]\tMSE:{mse:.4f}\tR2:{r2:.4f}") # Save the trained model model_filename = f"1d_qsar_{name.replace(' ', '_').lower()}_model.pkl" joblib.dump(regressor, os.path.join(os.getcwd(), model_filename)) print(f"Model saved to {model_filename}") # Sort results by R2 score sorted_results = sorted(results.items(), key=lambda x: x[1]["R2"], reverse=True) # Plot results (optional, for example purposes) plt.figure(figsize=(10, 6), dpi=300) plt.title("R2 Scores for Regressors") plt.barh([name for name, _ in sorted_results], [result['R2'] for _, result in sorted_results]) plt.xlabel("R2 Score") plt.savefig('qsar_1D_r2_scores.png', dpi=300)