update

2024-09-28 13:14:52 +08:00
parent 2f139ec93d
commit af43b65d34
18 changed files with 432026 additions and 79 deletions
--- a/MIC/qsar_1D.py
+++ b/MIC/qsar_1D.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@file               :qsar_1d.py
+@Description:       : 1D-QSAR modeling with SMILES and MIC data
+@Date               :2024/09/28
+@Author             :lyzeng
+@Email              :pylyzeng@gmail.com
+@version            :1.0
+'''
+
+import pandas as pd
+import numpy as np
+from rdkit import Chem
+from rdkit.Chem import Descriptors
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LinearRegression, SGDRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.neural_network import MLPRegressor
+from xgboost import XGBRegressor
+from sklearn.metrics import mean_squared_error, r2_score
+import matplotlib.pyplot as plt
+import joblib
+import os
+
+# Function to calculate 1D-QSAR descriptors
+def calculate_1dqsar_repr(smiles):
+    mol = Chem.MolFromSmiles(smiles)
+    if mol is None:
+        return None
+    descriptors = [
+        Descriptors.MolWt(mol),
+        Descriptors.MolLogP(mol),
+        Descriptors.NumHDonors(mol),
+        Descriptors.NumHAcceptors(mol),
+        Descriptors.TPSA(mol),
+        Descriptors.NumRotatableBonds(mol),
+        Descriptors.NumAromaticRings(mol),
+        Descriptors.NumAliphaticRings(mol),
+        Descriptors.NumSaturatedRings(mol),
+        Descriptors.NumHeteroatoms(mol),
+        Descriptors.NumValenceElectrons(mol),
+        Descriptors.NumRadicalElectrons(mol),
+        Descriptors.qed(mol)
+    ]
+    return descriptors
+
+if __name__ == '__main__':
+    # Load your dataset
+    df = pd.read_csv('/mnt/c/project/qsar/data/A_85.csv', sep=';')
+    df_with_MIC = df[df['Standard Type'] == 'MIC']
+
+    # Select relevant columns
+    qsar_df = df_with_MIC[['Smiles', 'Standard Value']].copy()
+    qsar_df.rename(columns={'Smiles': 'SMILES', 'Standard Value': 'TARGET'}, inplace=True)
+
+    # Calculate 1D-QSAR descriptors for each molecule
+    qsar_df['1dqsar_mr'] = qsar_df['SMILES'].apply(calculate_1dqsar_repr)
+
+    # Remove rows where descriptor calculation failed
+    qsar_df = qsar_df.dropna()
+
+    # Split the data into training and testing sets
+    train_df, test_df = train_test_split(qsar_df, test_size=0.2, random_state=42)
+
+    # Convert the data to NumPy arrays
+    train_x = np.array(train_df['1dqsar_mr'].tolist())
+    train_y = np.array(train_df['TARGET'].tolist())
+    test_x = np.array(test_df['1dqsar_mr'].tolist())
+    test_y = np.array(test_df['TARGET'].tolist())
+
+    # Define regressors to use
+    regressors = [
+        ("Linear Regression", LinearRegression()),
+        ("Stochastic Gradient Descent", SGDRegressor(random_state=42)),
+        ("K-Nearest Neighbors", KNeighborsRegressor()),
+        ("Decision Tree", DecisionTreeRegressor(random_state=42)),
+        ("Random Forest", RandomForestRegressor(random_state=42)),
+        ("XGBoost", XGBRegressor(eval_metric="rmse", random_state=42)),
+        ("Multi-layer Perceptron", MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam', max_iter=10000, random_state=42)),
+    ]
+
+    # Initialize results dictionary
+    results = {}
+
+    # Train and evaluate each regressor
+    for name, regressor in regressors:
+        regressor.fit(train_x, train_y)
+        pred_y = regressor.predict(test_x)
+        mse = mean_squared_error(test_y, pred_y)
+        r2 = r2_score(test_y, pred_y)
+        results[f"1D-QSAR-{name}"] = {"MSE": mse, "R2": r2}
+        print(f"[1D-QSAR][{name}]\tMSE:{mse:.4f}\tR2:{r2:.4f}")
+        
+        # Save the trained model
+        model_filename = f"1d_qsar_{name.replace(' ', '_').lower()}_model.pkl"
+        joblib.dump(regressor, os.path.join(os.getcwd(), model_filename))
+        print(f"Model saved to {model_filename}")
+
+    # Sort results by R2 score
+    sorted_results = sorted(results.items(), key=lambda x: x[1]["R2"], reverse=True)
+
+    # Plot results (optional, for example purposes)
+    plt.figure(figsize=(10, 6), dpi=300)
+    plt.title("R2 Scores for Regressors")
+    plt.barh([name for name, _ in sorted_results], [result['R2'] for _, result in sorted_results])
+    plt.xlabel("R2 Score")
+    plt.savefig('qsar_1D_r2_scores.png', dpi=300)