#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file        :qsar.py
@Description: : unimol and 1D,2D,3D qsar
@Date        :2024/09/28 10:12:51
@Author      :lyzeng
@Email       :pylyzeng@gmail.com
@version     :1.0
'''
# QSAR https://bohrium.dp.tech/notebooks/1032
# install unimol https://unimol.readthedocs.io/en/latest/installation.html#install
import pandas as pd

# Input export (semicolon-separated) and the artefacts this script writes.
RAW_CSV = '/mnt/c/project/qsar/data/A_85.csv'
TRAIN_CSV = 'qsar_training_data.csv'
TEST_CSV = 'qsar_test_data.csv'
MODEL_DIR = './exp'


def prepare_qsar_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Build the two-column SMILES/TARGET training table from the raw export.

    Keeps only rows whose 'Standard Type' is 'MIC', selects the key
    activity and structure data ('Smiles' and 'Standard Value'), renames
    them to 'SMILES' and 'TARGET', and drops rows where either is missing
    so that no NaN structure or NaN target reaches the trainer.

    Args:
        df: Raw table with at least the columns 'Standard Type', 'Smiles'
            and 'Standard Value'.

    Returns:
        A new DataFrame with columns ['SMILES', 'TARGET'].

    Raises:
        KeyError: If one of the required columns is absent.
    """
    mic_rows = df[df['Standard Type'] == 'MIC']
    frame = mic_rows[['Smiles', 'Standard Value']].rename(
        columns={'Smiles': 'SMILES', 'Standard Value': 'TARGET'}
    )
    # NOTE(review): 'Standard Units' is not checked here; if the export mixes
    # units the targets are not directly comparable -- confirm upstream.
    return frame.dropna(subset=['SMILES', 'TARGET'])


def split_and_save(
    frame: pd.DataFrame,
    test_size: float = 0.2,
    random_state: int = 42,
    train_path: str = TRAIN_CSV,
    test_path: str = TEST_CSV,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Split *frame* into train/test parts and write both to CSV.

    With the default ``test_size=0.2`` this is an 80% / 20% split.

    Args:
        frame: Table to split.
        test_size: Fraction of rows placed in the test set.
        random_state: Seed passed to ``train_test_split``.
        train_path: Destination CSV for the training rows.
        test_path: Destination CSV for the test rows.

    Returns:
        ``(train_df, test_df)``.
    """
    # Imported here so the module can be imported without scikit-learn.
    from sklearn.model_selection import train_test_split

    train_df, test_df = train_test_split(
        frame, test_size=test_size, random_state=random_state
    )
    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)
    return train_df, test_df


def train_model(train_path: str = TRAIN_CSV, save_path: str = MODEL_DIR):
    """Train a Uni-Mol regression model on the CSV at *train_path*.

    Returns:
        Whatever ``MolTrain.fit`` returns.
    """
    from unimol_tools import MolTrain

    ## download mol.dict.txt
    ## python script :/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights
    clf = MolTrain(
        task='regression',  # or 'classification' based on your needs
        data_type='molecule',
        epochs=10,
        batch_size=16,
        metrics='mse',  # Use 'mse' for regression
        save_path=save_path,
    )
    return clf.fit(data=train_path)


def predict_test_set(test_path: str = TEST_CSV, model_dir: str = MODEL_DIR):
    """Load the model saved in *model_dir* and predict on *test_path*.

    Returns:
        Whatever ``MolPredict.predict`` returns.
    """
    from unimol_tools import MolPredict

    clf = MolPredict(load_model=model_dir)
    return clf.predict(data=test_path)


def main(raw_csv: str = RAW_CSV):
    """Run the pipeline: prepare data, split, train, then predict.

    Returns:
        The prediction result for the held-out test set.
    """
    df = pd.read_csv(raw_csv, sep=';')

    # prepare data
    qsar_df_formatted = prepare_qsar_frame(df)
    print(qsar_df_formatted.head())

    # split data (in memory; only the final train/test CSVs are written)
    train_df, test_df = split_and_save(qsar_df_formatted)

    # Output the size of the datasets to verify
    print(f"Training set size: {train_df.shape[0]} samples")
    print(f"Test set size: {test_df.shape[0]} samples")

    # train and test
    train_model()
    return predict_test_set()


if __name__ == '__main__':
    main()