update
This commit is contained in:
81
qsar.py
Normal file
81
qsar.py
Normal file
@@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- encoding: utf-8 -*-
|
||||
'''
|
||||
@file :qsar.py
|
||||
@Description :unimol and 1D, 2D, 3D QSAR
|
||||
@Date :2024/09/28 10:12:51
|
||||
@Author :lyzeng
|
||||
@Email :pylyzeng@gmail.com
|
||||
@version :1.0
|
||||
'''
|
||||
|
||||
# QSAR https://bohrium.dp.tech/notebooks/1032
|
||||
# install unimol https://unimol.readthedocs.io/en/latest/installation.html#install
|
||||
import pandas as pd
|
||||
|
||||
|
||||
|
||||
# Input/output locations used by the pipeline.
RAW_ACTIVITY_CSV = '/mnt/c/project/qsar/data/A_85.csv'
TRAIN_CSV = 'qsar_training_data.csv'
TEST_CSV = 'qsar_test_data.csv'
MODEL_DIR = './exp'

# Key activity and structure columns kept from the raw activity table.
QSAR_COLUMNS = [
    'Molecule ChEMBL ID', 'Smiles', 'Standard Value', 'Standard Units',
    'AlogP', 'Molecular Weight', '#RO5 Violations', 'Ligand Efficiency LE',
    'Target Name', 'Target ChEMBL ID',
]


def select_mic_records(df):
    '''Return the rows whose 'Standard Type' is 'MIC', restricted to QSAR_COLUMNS.

    Raises:
        KeyError: if 'Standard Type' or any of QSAR_COLUMNS is absent from *df*.
    '''
    mic_rows = df[df['Standard Type'] == 'MIC']
    return mic_rows[QSAR_COLUMNS]


def build_training_frame(records):
    '''Build the two-column (SMILES, TARGET) table handed to the trainer.

    'Smiles' is renamed to 'SMILES' and 'Standard Value' to 'TARGET'.
    TARGET is coerced to a number, and rows with a missing SMILES or a
    missing/non-numeric TARGET are dropped, because a regression target
    must be numeric for every sample. *records* itself is not modified.
    '''
    frame = records[['Smiles', 'Standard Value']].rename(
        columns={'Smiles': 'SMILES', 'Standard Value': 'TARGET'}
    )
    frame = frame.assign(TARGET=pd.to_numeric(frame['TARGET'], errors='coerce'))
    return frame.dropna(subset=['SMILES', 'TARGET'])


def split_train_test(frame, test_size=0.2, random_state=42):
    '''Split *frame* into (train, test) DataFrames.

    With the default ``test_size=0.2`` the split is 80% training / 20% test;
    ``random_state`` is fixed so the split is reproducible.
    '''
    # Imported here so the data-preparation helpers stay usable without sklearn.
    from sklearn.model_selection import train_test_split

    return train_test_split(frame, test_size=test_size, random_state=random_state)


def train_and_predict(train_csv, test_csv, model_dir=MODEL_DIR):
    '''Train a Uni-Mol regression model on *train_csv* and predict on *test_csv*.

    The model is saved to *model_dir* by the trainer and loaded back from the
    same directory for prediction. Returns whatever ``MolPredict.predict``
    returns for *test_csv*.
    '''
    from unimol_tools import MolTrain, MolPredict

    # NOTE: the original author's notes say mol.dict.txt has to be downloaded
    # first, presumably into the unimol_tools weights directory, e.g.
    # /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights
    trainer = MolTrain(
        task='regression',  # or 'classification' based on your needs
        data_type='molecule',
        epochs=10,
        batch_size=16,
        metrics='mse',  # use 'mse' for regression
        save_path=model_dir,
    )
    trainer.fit(data=train_csv)

    predictor = MolPredict(load_model=model_dir)
    return predictor.predict(data=test_csv)


def main():
    '''Run the pipeline: load, filter, split, save, train and predict.'''
    raw = pd.read_csv(RAW_ACTIVITY_CSV, sep=';')
    mic_records = select_mic_records(raw)

    dataset = build_training_frame(mic_records)
    dropped = len(mic_records) - len(dataset)
    if dropped:
        print(f"Dropped {dropped} rows with missing SMILES or non-numeric target")
    print(dataset.head())

    # Split in memory and write each subset exactly once; the training file
    # only ever holds the training split.
    train_df, test_df = split_train_test(dataset)
    train_df.to_csv(TRAIN_CSV, index=False)
    test_df.to_csv(TEST_CSV, index=False)

    # Output the size of the datasets to verify the split.
    print(f"Training set size: {train_df.shape[0]} samples")
    print(f"Test set size: {test_df.shape[0]} samples")

    return train_and_predict(TRAIN_CSV, TEST_CSV)


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user