update

2024-09-28 13:14:52 +08:00
parent 2f139ec93d
commit af43b65d34
18 changed files with 432026 additions and 79 deletions
--- a/qsar.py
+++ b/qsar.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@file               :qsar.py
+@Description        :unimol and 1D, 2D, 3D QSAR
+@Date               :2024/09/28 10:12:51
+@Author             :lyzeng
+@Email              :pylyzeng@gmail.com
+@version            :1.0
+'''
+
+# QSAR https://bohrium.dp.tech/notebooks/1032
+# install unimol https://unimol.readthedocs.io/en/latest/installation.html#install
+import pandas as pd
+
+
+
+if __name__ == '__main__':
+    df = pd.read_csv('/mnt/c/project/qsar/data/A_85.csv',sep=';')
+    df_with_MIC = df[df['Standard Type'] == 'MIC']
+    # Select the key activity and structure columns
+    qsar_df = df_with_MIC[['Molecule ChEMBL ID', 'Smiles', 'Standard Value', 'Standard Units', 
+                        'AlogP', 'Molecular Weight', '#RO5 Violations', 'Ligand Efficiency LE',
+                        'Target Name', 'Target ChEMBL ID']]
+    # Prepare data: reduce the table to the 'Smiles' and 'Standard Value' columns
+    import pandas as pd
+
+    # .copy() gives an independent frame, so the in-place rename below does not touch qsar_df
+    qsar_df_formatted = qsar_df[['Smiles', 'Standard Value']].copy()
+
+    # Rename 'Smiles' -> 'SMILES' and 'Standard Value' -> 'TARGET' (presumably the names unimol_tools expects; confirm)
+    qsar_df_formatted.rename(columns={'Smiles': 'SMILES', 'Standard Value': 'TARGET'}, inplace=True)
+
+    # Print the first rows as a quick sanity check
+    print(qsar_df_formatted.head())
+
+    qsar_df_formatted.to_csv('qsar_training_data.csv', index=False)
+
+    # Split data into a training set and a test set
+
+    import pandas as pd
+    from sklearn.model_selection import train_test_split
+
+    # Reload the formatted dataset written above (this rebinds qsar_df)
+    qsar_df = pd.read_csv('qsar_training_data.csv')
+
+    # Split the data: 80% for training, 20% for testing (test_size=0.2); random_state fixes the shuffle
+    train_df, test_df = train_test_split(qsar_df, test_size=0.2, random_state=42)
+
+    # Save both splits; NOTE: this overwrites the full 'qsar_training_data.csv' with the training split only
+    train_df.to_csv('qsar_training_data.csv', index=False)
+    test_df.to_csv('qsar_test_data.csv', index=False)
+
+    # Report the number of rows in each split to verify the split
+    print(f"Training set size: {train_df.shape[0]} samples")
+    print(f"Test set size: {test_df.shape[0]} samples")
+
+    # Train and test with unimol_tools
+
+    from unimol_tools import MolTrain, MolPredict
+
+    # Path of the training split saved above
+    train_data = 'qsar_training_data.csv'
+
+    # Configure the trainer: regression task, 10 epochs, batch size 16, save_path './exp'
+    clf = MolTrain(task='regression',  # or 'classification' based on your needs
+                data_type='molecule', 
+                epochs=10, 
+                batch_size=16, 
+                metrics='mse',  # Use 'mse' for regression
+                save_path='./exp'
+                )
+    
+    pred = clf.fit(data=train_data)
+
+    ## NOTE(review): mol.dict.txt apparently has to be downloaded into the unimol_tools weights directory,
+    ## e.g. /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights (TODO confirm)
+    # Predict on the test split using the model loaded from './exp'
+    test_data = 'qsar_test_data.csv'  # test split written above
+    clf = MolPredict(load_model='./exp')
+    res = clf.predict(data=test_data)