Files
qsar/analyse.ipynb
2024-09-28 13:14:52 +08:00

808 lines
39 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Prerequisite: Uni-Mol Tools, see https://unimol.readthedocs.io/en/latest/installation.html#install\n",
"import pandas as pd\n",
"\n",
"# Raw activity table; the file is semicolon-delimited.\n",
"# The path is absolute and machine-specific: adjust it for your environment.\n",
"RAW_DATA_CSV = '/mnt/c/project/qsar/data/A_85.csv'\n",
"df = pd.read_csv(RAW_DATA_CSV, sep=';')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Molecule ChEMBL ID</th>\n",
" <th>Molecule Name</th>\n",
" <th>Molecule Max Phase</th>\n",
" <th>Molecular Weight</th>\n",
" <th>#RO5 Violations</th>\n",
" <th>AlogP</th>\n",
" <th>Compound Key</th>\n",
" <th>Smiles</th>\n",
" <th>Standard Type</th>\n",
" <th>Standard Relation</th>\n",
" <th>...</th>\n",
" <th>Target Type</th>\n",
" <th>Document ChEMBL ID</th>\n",
" <th>Source ID</th>\n",
" <th>Source Description</th>\n",
" <th>Document Journal</th>\n",
" <th>Document Year</th>\n",
" <th>Cell ChEMBL ID</th>\n",
" <th>Properties</th>\n",
" <th>Action Type</th>\n",
" <th>Standard Text Value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CHEMBL5184894</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>916.22</td>\n",
" <td>3</td>\n",
" <td>5.74</td>\n",
" <td>C-3</td>\n",
" <td>CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...</td>\n",
" <td>MIC</td>\n",
" <td>'='</td>\n",
" <td>...</td>\n",
" <td>ORGANISM</td>\n",
" <td>CHEMBL5126592</td>\n",
" <td>1</td>\n",
" <td>Scientific Literature</td>\n",
" <td>Eur J Med Chem</td>\n",
" <td>2022</td>\n",
" <td>None</td>\n",
" <td>Time_Lower = 16.0 hrs | Time_Upper = 20.0 hrs</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CHEMBL5198466</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>902.19</td>\n",
" <td>3</td>\n",
" <td>5.35</td>\n",
" <td>C-2</td>\n",
" <td>CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...</td>\n",
" <td>MIC</td>\n",
" <td>'='</td>\n",
" <td>...</td>\n",
" <td>ORGANISM</td>\n",
" <td>CHEMBL5126592</td>\n",
" <td>1</td>\n",
" <td>Scientific Literature</td>\n",
" <td>Eur J Med Chem</td>\n",
" <td>2022</td>\n",
" <td>None</td>\n",
" <td>Time_Lower = 16.0 hrs | Time_Upper = 20.0 hrs</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CHEMBL5179714</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>976.02</td>\n",
" <td>3</td>\n",
" <td>5.80</td>\n",
" <td>A-7</td>\n",
" <td>CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...</td>\n",
" <td>MIC</td>\n",
" <td>'='</td>\n",
" <td>...</td>\n",
" <td>ORGANISM</td>\n",
" <td>CHEMBL5126592</td>\n",
" <td>1</td>\n",
" <td>Scientific Literature</td>\n",
" <td>Eur J Med Chem</td>\n",
" <td>2022</td>\n",
" <td>None</td>\n",
" <td>Time_Lower = 16.0 hrs | Time_Upper = 20.0 hrs</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CHEMBL5190735</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>904.16</td>\n",
" <td>2</td>\n",
" <td>3.93</td>\n",
" <td>C-4</td>\n",
" <td>CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...</td>\n",
" <td>MIC</td>\n",
" <td>'='</td>\n",
" <td>...</td>\n",
" <td>ORGANISM</td>\n",
" <td>CHEMBL5126592</td>\n",
" <td>1</td>\n",
" <td>Scientific Literature</td>\n",
" <td>Eur J Med Chem</td>\n",
" <td>2022</td>\n",
" <td>None</td>\n",
" <td>Time_Lower = 16.0 hrs | Time_Upper = 20.0 hrs</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CHEMBL5199514</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>897.13</td>\n",
" <td>3</td>\n",
" <td>5.03</td>\n",
" <td>1-23(A-2)</td>\n",
" <td>CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...</td>\n",
" <td>MIC</td>\n",
" <td>'='</td>\n",
" <td>...</td>\n",
" <td>ORGANISM</td>\n",
" <td>CHEMBL5126592</td>\n",
" <td>1</td>\n",
" <td>Scientific Literature</td>\n",
" <td>Eur J Med Chem</td>\n",
" <td>2022</td>\n",
" <td>None</td>\n",
" <td>Time_Lower = 16.0 hrs | Time_Upper = 20.0 hrs</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 47 columns</p>\n",
"</div>"
],
"text/plain": [
" Molecule ChEMBL ID Molecule Name Molecule Max Phase Molecular Weight \\\n",
"0 CHEMBL5184894 NaN None 916.22 \n",
"1 CHEMBL5198466 NaN None 902.19 \n",
"2 CHEMBL5179714 NaN None 976.02 \n",
"3 CHEMBL5190735 NaN None 904.16 \n",
"4 CHEMBL5199514 NaN None 897.13 \n",
"\n",
" #RO5 Violations AlogP Compound Key \\\n",
"0 3 5.74 C-3 \n",
"1 3 5.35 C-2 \n",
"2 3 5.80 A-7 \n",
"3 2 3.93 C-4 \n",
"4 3 5.03 1-23(A-2) \n",
"\n",
" Smiles Standard Type \\\n",
"0 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... MIC \n",
"1 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... MIC \n",
"2 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... MIC \n",
"3 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... MIC \n",
"4 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... MIC \n",
"\n",
" Standard Relation ... Target Type Document ChEMBL ID Source ID \\\n",
"0 '=' ... ORGANISM CHEMBL5126592 1 \n",
"1 '=' ... ORGANISM CHEMBL5126592 1 \n",
"2 '=' ... ORGANISM CHEMBL5126592 1 \n",
"3 '=' ... ORGANISM CHEMBL5126592 1 \n",
"4 '=' ... ORGANISM CHEMBL5126592 1 \n",
"\n",
" Source Description Document Journal Document Year Cell ChEMBL ID \\\n",
"0 Scientific Literature Eur J Med Chem 2022 None \n",
"1 Scientific Literature Eur J Med Chem 2022 None \n",
"2 Scientific Literature Eur J Med Chem 2022 None \n",
"3 Scientific Literature Eur J Med Chem 2022 None \n",
"4 Scientific Literature Eur J Med Chem 2022 None \n",
"\n",
" Properties Action Type \\\n",
"0 Time_Lower = 16.0 hrs | Time_Upper = 20.0 hrs NaN \n",
"1 Time_Lower = 16.0 hrs | Time_Upper = 20.0 hrs NaN \n",
"2 Time_Lower = 16.0 hrs | Time_Upper = 20.0 hrs NaN \n",
"3 Time_Lower = 16.0 hrs | Time_Upper = 20.0 hrs NaN \n",
"4 Time_Lower = 16.0 hrs | Time_Upper = 20.0 hrs NaN \n",
"\n",
" Standard Text Value \n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
"[5 rows x 47 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Molecule ChEMBL ID', 'Molecule Name', 'Molecule Max Phase',\n",
" 'Molecular Weight', '#RO5 Violations', 'AlogP', 'Compound Key',\n",
" 'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',\n",
" 'Standard Units', 'pChEMBL Value', 'Data Validity Comment', 'Comment',\n",
" 'Uo Units', 'Ligand Efficiency BEI', 'Ligand Efficiency LE',\n",
" 'Ligand Efficiency LLE', 'Ligand Efficiency SEI', 'Potential Duplicate',\n",
" 'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',\n",
" 'BAO Label', 'Assay Organism', 'Assay Tissue ChEMBL ID',\n",
" 'Assay Tissue Name', 'Assay Cell Type', 'Assay Subcellular Fraction',\n",
" 'Assay Parameters', 'Assay Variant Accession', 'Assay Variant Mutation',\n",
" 'Target ChEMBL ID', 'Target Name', 'Target Organism', 'Target Type',\n",
" 'Document ChEMBL ID', 'Source ID', 'Source Description',\n",
" 'Document Journal', 'Document Year', 'Cell ChEMBL ID', 'Properties',\n",
" 'Action Type', 'Standard Text Value'],\n",
" dtype='object')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Activity', 'Inhibition', 'MBC', 'MIC', 'Ratio'}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct measurement types present in the 'Standard Type' column\n",
"set(df['Standard Type'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Restrict the table to the rows whose measurement type is MIC\n",
"is_mic_row = df['Standard Type'].eq('MIC')\n",
"df_with_MIC = df.loc[is_mic_row]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Select the key activity and structure columns\n",
"qsar_columns = [\n",
"    'Molecule ChEMBL ID', 'Smiles',\n",
"    'Standard Value', 'Standard Units',\n",
"    'AlogP', 'Molecular Weight', '#RO5 Violations', 'Ligand Efficiency LE',\n",
"    'Target Name', 'Target ChEMBL ID',\n",
"]\n",
"qsar_df = df_with_MIC[qsar_columns]\n",
"\n",
"# Further processing or cleaning of the data can be added here if needed\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" SMILES TARGET\n",
"0 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... 0.5\n",
"1 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... 0.5\n",
"2 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... 32.0\n",
"3 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... 16.0\n",
"4 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... 4.0\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Two-column training table: 'Smiles' becomes 'SMILES' and 'Standard Value' becomes 'TARGET'.\n",
"training_column_names = {'Smiles': 'SMILES', 'Standard Value': 'TARGET'}\n",
"qsar_df_formatted = qsar_df[list(training_column_names)].rename(columns=training_column_names)\n",
"\n",
"# Preview the first rows of the training table\n",
"print(qsar_df_formatted.head())\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"qsar_df_formatted.to_csv('qsar_training_data.csv', index=False)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Environment setup (upgrade `libstdc++6`):\n",
"\n",
"```shell\n",
"sudo apt-get update\n",
"sudo apt-get install --only-upgrade libstdc++6\n",
"```\n",
"\n",
"- Uni-Mol Tools tutorial: https://unimol.readthedocs.io/en/latest/tutorial.html\n",
"- Default directory for the pretrained weights: `/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights`\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training set size: 151 samples\n",
"Test set size: 38 samples\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Split the in-memory table built above instead of re-reading 'qsar_training_data.csv':\n",
"# this cell overwrites that file with the training subset, so reading it back here\n",
"# would split an already-split dataset every time the cell is re-run.\n",
"# `qsar_df` is rebound (with a fresh 0..n-1 index, as a CSV round trip would give) so that\n",
"# later cells inspecting `qsar_df` keep seeing the SMILES/TARGET table.\n",
"qsar_df = qsar_df_formatted.reset_index(drop=True)\n",
"\n",
"# Split the data: 80% for training, 20% for testing (test_size=0.2)\n",
"train_df, test_df = train_test_split(qsar_df, test_size=0.2, random_state=42)\n",
"\n",
"# Every row must land in exactly one of the two subsets\n",
"assert len(train_df) + len(test_df) == len(qsar_df)\n",
"\n",
"# Save the resulting datasets to CSV files\n",
"train_df.to_csv('qsar_training_data.csv', index=False)\n",
"test_df.to_csv('qsar_test_data.csv', index=False)\n",
"\n",
"# Output the size of the datasets to verify\n",
"print(f\"Training set size: {train_df.shape[0]} samples\")\n",
"print(f\"Test set size: {test_df.shape[0]} samples\")\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-09-27 22:45:27 | unimol_tools/weights/weighthub.py | 17 | INFO | Uni-Mol Tools | Weights will be downloaded to default directory: /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights\n"
]
}
],
"source": [
"from unimol_tools import MolTrain, MolPredict\n",
"\n",
"# Path of the training CSV handed to clf.fit() in the next cell\n",
"train_data = 'qsar_training_data.csv'\n",
"\n",
"# Trainer settings, collected in one place and unpacked into the constructor\n",
"trainer_settings = {\n",
"    'task': 'regression',  # or 'classification' based on your needs\n",
"    'data_type': 'molecule',\n",
"    'epochs': 10,\n",
"    'batch_size': 16,\n",
"    'metrics': 'mse',  # Use 'mse' for regression\n",
"    'save_path': './exp',\n",
"}\n",
"clf = MolTrain(**trainer_settings)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-09-27 22:45:27 | unimol_tools/data/datareader.py | 188 | INFO | Uni-Mol Tools | Anomaly clean with 3 sigma threshold: 151 -> 151\n",
"2024-09-27 22:45:28 | unimol_tools/data/conformer.py | 89 | INFO | Uni-Mol Tools | Start generating conformers...\n",
"151it [00:11, 13.52it/s]\n",
"2024-09-27 22:45:39 | unimol_tools/data/conformer.py | 93 | INFO | Uni-Mol Tools | Succeed to generate conformers for 100.00% of molecules.\n",
"2024-09-27 22:45:39 | unimol_tools/data/conformer.py | 95 | INFO | Uni-Mol Tools | Succeed to generate 3d conformers for 69.54% of molecules.\n",
"2024-09-27 22:45:39 | unimol_tools/train.py | 172 | INFO | Uni-Mol Tools | Output directory already exists: ./exp\n",
"2024-09-27 22:45:39 | unimol_tools/train.py | 173 | INFO | Uni-Mol Tools | Warning: Overwrite output directory: ./exp\n",
"2024-09-27 22:45:39 | unimol_tools/models/unimol.py | 124 | INFO | Uni-Mol Tools | Loading pretrained weights from /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WEIGHT_DIR /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights\n",
"MODEL_CONFIG {'weight': {'protein': 'poc_pre_220816.pt', 'molecule_no_h': 'mol_pre_no_h_220816.pt', 'molecule_all_h': 'mol_pre_all_h_220816.pt', 'crystal': 'mp_all_h_230313.pt', 'oled': 'oled_pre_no_h_230101.pt'}, 'dict': {'protein': 'poc.dict.txt', 'molecule_no_h': 'mol.dict.txt', 'molecule_all_h': 'mol.dict.txt', 'crystal': 'mp.dict.txt', 'oled': 'oled.dict.txt'}}\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol.dict.txt\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-09-27 22:45:40 | unimol_tools/models/nnmodel.py | 142 | INFO | Uni-Mol Tools | start training Uni-Mol:unimolv1\n",
"2024-09-27 22:45:44 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [1/10] train_loss: 1.3586, val_loss: 2.4455, val_mse: 280.7126, lr: 0.000093, 3.3s\n",
"2024-09-27 22:45:45 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [2/10] train_loss: 1.0766, val_loss: 1.1453, val_mse: 131.8109, lr: 0.000082, 0.8s\n",
"2024-09-27 22:45:47 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [3/10] train_loss: 0.9960, val_loss: 1.0161, val_mse: 116.7266, lr: 0.000072, 0.6s\n",
"2024-09-27 22:45:48 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [4/10] train_loss: 1.0533, val_loss: 1.1678, val_mse: 133.8159, lr: 0.000062, 0.7s\n",
"2024-09-27 22:45:49 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [5/10] train_loss: 0.8922, val_loss: 0.9024, val_mse: 103.2530, lr: 0.000051, 0.7s\n",
"2024-09-27 22:45:51 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [6/10] train_loss: 0.8443, val_loss: 1.2440, val_mse: 142.2061, lr: 0.000041, 0.7s\n",
"2024-09-27 22:45:51 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [7/10] train_loss: 0.9452, val_loss: 0.8510, val_mse: 97.0552, lr: 0.000031, 0.7s\n",
"2024-09-27 22:45:53 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [8/10] train_loss: 0.9270, val_loss: 1.0135, val_mse: 115.5859, lr: 0.000021, 0.7s\n",
"2024-09-27 22:45:53 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [9/10] train_loss: 0.8749, val_loss: 0.9838, val_mse: 112.1781, lr: 0.000010, 0.7s\n",
"2024-09-27 22:45:54 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [10/10] train_loss: 0.9159, val_loss: 1.0383, val_mse: 118.4027, lr: 0.000000, 0.7s\n",
"2024-09-27 22:45:55 | unimol_tools/tasks/trainer.py | 300 | INFO | Uni-Mol Tools | load model success!\n",
"2024-09-27 22:45:55 | unimol_tools/models/nnmodel.py | 168 | INFO | Uni-Mol Tools | fold 0, result {'mse': 97.05515, 'mae': 7.331252, 'pearsonr': 0.5909511971112806, 'spearmanr': 0.5808793020991461, 'r2': 0.2890600562095642}\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WEIGHT_DIR /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights\n",
"MODEL_CONFIG {'weight': {'protein': 'poc_pre_220816.pt', 'molecule_no_h': 'mol_pre_no_h_220816.pt', 'molecule_all_h': 'mol_pre_all_h_220816.pt', 'crystal': 'mp_all_h_230313.pt', 'oled': 'oled_pre_no_h_230101.pt'}, 'dict': {'protein': 'poc.dict.txt', 'molecule_no_h': 'mol.dict.txt', 'molecule_all_h': 'mol.dict.txt', 'crystal': 'mp.dict.txt', 'oled': 'oled.dict.txt'}}\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol.dict.txt\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-09-27 22:45:55 | unimol_tools/models/unimol.py | 124 | INFO | Uni-Mol Tools | Loading pretrained weights from /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n",
"2024-09-27 22:45:56 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [1/10] train_loss: 1.3883, val_loss: 0.6170, val_mse: 73.4399, lr: 0.000093, 0.7s\n",
"2024-09-27 22:45:58 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [2/10] train_loss: 1.3161, val_loss: 0.9960, val_mse: 114.1673, lr: 0.000082, 0.8s\n",
"2024-09-27 22:45:58 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [3/10] train_loss: 1.1456, val_loss: 0.5966, val_mse: 70.8534, lr: 0.000072, 0.6s\n",
"2024-09-27 22:46:00 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [4/10] train_loss: 1.1155, val_loss: 0.6260, val_mse: 74.4525, lr: 0.000062, 0.8s\n",
"2024-09-27 22:46:01 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [5/10] train_loss: 0.9599, val_loss: 0.7855, val_mse: 90.7978, lr: 0.000051, 0.7s\n",
"2024-09-27 22:46:01 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [6/10] train_loss: 1.0833, val_loss: 0.7717, val_mse: 89.3328, lr: 0.000041, 0.7s\n",
"2024-09-27 22:46:02 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [7/10] train_loss: 1.0203, val_loss: 1.0910, val_mse: 124.0563, lr: 0.000031, 0.7s\n",
"2024-09-27 22:46:03 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [8/10] train_loss: 0.9478, val_loss: 0.7585, val_mse: 89.1448, lr: 0.000021, 0.7s\n",
"2024-09-27 22:46:03 | unimol_tools/utils/metrics.py | 234 | WARNING | Uni-Mol Tools | Early stopping at epoch: 8\n",
"2024-09-27 22:46:03 | unimol_tools/tasks/trainer.py | 300 | INFO | Uni-Mol Tools | load model success!\n",
"2024-09-27 22:46:04 | unimol_tools/models/nnmodel.py | 168 | INFO | Uni-Mol Tools | fold 1, result {'mse': 70.85342, 'mae': 5.747976, 'pearsonr': 0.20143281994387507, 'spearmanr': 0.2609365195530784, 'r2': 0.035624027252197266}\n",
"2024-09-27 22:46:04 | unimol_tools/models/unimol.py | 124 | INFO | Uni-Mol Tools | Loading pretrained weights from /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WEIGHT_DIR /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights\n",
"MODEL_CONFIG {'weight': {'protein': 'poc_pre_220816.pt', 'molecule_no_h': 'mol_pre_no_h_220816.pt', 'molecule_all_h': 'mol_pre_all_h_220816.pt', 'crystal': 'mp_all_h_230313.pt', 'oled': 'oled_pre_no_h_230101.pt'}, 'dict': {'protein': 'poc.dict.txt', 'molecule_no_h': 'mol.dict.txt', 'molecule_all_h': 'mol.dict.txt', 'crystal': 'mp.dict.txt', 'oled': 'oled.dict.txt'}}\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol.dict.txt\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-09-27 22:46:05 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [1/10] train_loss: 1.4606, val_loss: 3.3206, val_mse: 378.6044, lr: 0.000093, 0.7s\n",
"2024-09-27 22:46:06 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [2/10] train_loss: 1.4898, val_loss: 0.9078, val_mse: 105.1634, lr: 0.000082, 0.6s\n",
"2024-09-27 22:46:08 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [3/10] train_loss: 1.0155, val_loss: 1.0172, val_mse: 117.1504, lr: 0.000072, 0.7s\n",
"2024-09-27 22:46:09 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [4/10] train_loss: 0.9078, val_loss: 1.0485, val_mse: 120.6293, lr: 0.000062, 0.7s\n",
"2024-09-27 22:46:09 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [5/10] train_loss: 0.9419, val_loss: 1.0336, val_mse: 119.9488, lr: 0.000051, 0.6s\n",
"2024-09-27 22:46:10 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [6/10] train_loss: 0.9781, val_loss: 0.9392, val_mse: 108.4770, lr: 0.000041, 0.6s\n",
"2024-09-27 22:46:10 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [7/10] train_loss: 0.8445, val_loss: 0.8251, val_mse: 95.0762, lr: 0.000031, 0.7s\n",
"2024-09-27 22:46:12 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [8/10] train_loss: 0.9550, val_loss: 0.8084, val_mse: 93.2420, lr: 0.000021, 0.6s\n",
"2024-09-27 22:46:13 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [9/10] train_loss: 0.7777, val_loss: 0.8005, val_mse: 92.2463, lr: 0.000010, 0.7s\n",
"2024-09-27 22:46:15 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [10/10] train_loss: 0.7272, val_loss: 0.8880, val_mse: 101.9108, lr: 0.000000, 0.7s\n",
"2024-09-27 22:46:16 | unimol_tools/tasks/trainer.py | 300 | INFO | Uni-Mol Tools | load model success!\n",
"2024-09-27 22:46:16 | unimol_tools/models/nnmodel.py | 168 | INFO | Uni-Mol Tools | fold 2, result {'mse': 92.246346, 'mae': 6.6947527, 'pearsonr': 0.2854272922850214, 'spearmanr': 0.3806770274531352, 'r2': 0.010852813720703125}\n",
"2024-09-27 22:46:16 | unimol_tools/models/unimol.py | 124 | INFO | Uni-Mol Tools | Loading pretrained weights from /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WEIGHT_DIR /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights\n",
"MODEL_CONFIG {'weight': {'protein': 'poc_pre_220816.pt', 'molecule_no_h': 'mol_pre_no_h_220816.pt', 'molecule_all_h': 'mol_pre_all_h_220816.pt', 'crystal': 'mp_all_h_230313.pt', 'oled': 'oled_pre_no_h_230101.pt'}, 'dict': {'protein': 'poc.dict.txt', 'molecule_no_h': 'mol.dict.txt', 'molecule_all_h': 'mol.dict.txt', 'crystal': 'mp.dict.txt', 'oled': 'oled.dict.txt'}}\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol.dict.txt\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-09-27 22:46:17 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [1/10] train_loss: 1.2635, val_loss: 1.7796, val_mse: 199.5377, lr: 0.000093, 0.6s\n",
"2024-09-27 22:46:18 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [2/10] train_loss: 1.1977, val_loss: 0.8583, val_mse: 97.1572, lr: 0.000082, 0.6s\n",
"2024-09-27 22:46:19 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [3/10] train_loss: 1.1185, val_loss: 0.8756, val_mse: 100.4343, lr: 0.000072, 0.7s\n",
"2024-09-27 22:46:20 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [4/10] train_loss: 0.9613, val_loss: 0.9776, val_mse: 109.5843, lr: 0.000062, 0.7s\n",
"2024-09-27 22:46:21 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [5/10] train_loss: 1.1364, val_loss: 0.7891, val_mse: 90.3483, lr: 0.000051, 0.7s\n",
"2024-09-27 22:46:22 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [6/10] train_loss: 0.9706, val_loss: 0.8194, val_mse: 92.2830, lr: 0.000041, 0.7s\n",
"2024-09-27 22:46:23 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [7/10] train_loss: 0.9130, val_loss: 0.6754, val_mse: 75.7993, lr: 0.000031, 0.6s\n",
"2024-09-27 22:46:24 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [8/10] train_loss: 0.9331, val_loss: 0.6883, val_mse: 77.6490, lr: 0.000021, 0.6s\n",
"2024-09-27 22:46:25 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [9/10] train_loss: 0.8835, val_loss: 0.6737, val_mse: 75.2476, lr: 0.000010, 0.7s\n",
"2024-09-27 22:46:26 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [10/10] train_loss: 0.8302, val_loss: 0.6861, val_mse: 76.5509, lr: 0.000000, 0.6s\n",
"2024-09-27 22:46:27 | unimol_tools/tasks/trainer.py | 300 | INFO | Uni-Mol Tools | load model success!\n",
"2024-09-27 22:46:27 | unimol_tools/models/nnmodel.py | 168 | INFO | Uni-Mol Tools | fold 3, result {'mse': 75.2476, 'mae': 6.477419, 'pearsonr': 0.525078779024625, 'spearmanr': 0.41594596740381523, 'r2': 0.2580321431159973}\n",
"2024-09-27 22:46:27 | unimol_tools/models/unimol.py | 124 | INFO | Uni-Mol Tools | Loading pretrained weights from /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WEIGHT_DIR /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights\n",
"MODEL_CONFIG {'weight': {'protein': 'poc_pre_220816.pt', 'molecule_no_h': 'mol_pre_no_h_220816.pt', 'molecule_all_h': 'mol_pre_all_h_220816.pt', 'crystal': 'mp_all_h_230313.pt', 'oled': 'oled_pre_no_h_230101.pt'}, 'dict': {'protein': 'poc.dict.txt', 'molecule_no_h': 'mol.dict.txt', 'molecule_all_h': 'mol.dict.txt', 'crystal': 'mp.dict.txt', 'oled': 'oled.dict.txt'}}\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol.dict.txt\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-09-27 22:46:28 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [1/10] train_loss: 1.2777, val_loss: 1.9819, val_mse: 227.5000, lr: 0.000093, 0.7s\n",
"2024-09-27 22:46:30 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [2/10] train_loss: 1.4327, val_loss: 3.0447, val_mse: 346.6302, lr: 0.000082, 0.7s\n",
"2024-09-27 22:46:30 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [3/10] train_loss: 1.2044, val_loss: 1.2728, val_mse: 145.1859, lr: 0.000072, 0.6s\n",
"2024-09-27 22:46:32 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [4/10] train_loss: 0.9925, val_loss: 1.7034, val_mse: 194.0388, lr: 0.000062, 0.6s\n",
"2024-09-27 22:46:32 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [5/10] train_loss: 0.8893, val_loss: 1.0293, val_mse: 117.8387, lr: 0.000051, 0.7s\n",
"2024-09-27 22:46:34 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [6/10] train_loss: 0.9734, val_loss: 1.1283, val_mse: 129.1722, lr: 0.000041, 0.6s\n",
"2024-09-27 22:46:35 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [7/10] train_loss: 0.9078, val_loss: 1.1153, val_mse: 127.7389, lr: 0.000031, 0.7s\n",
"2024-09-27 22:46:35 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [8/10] train_loss: 1.0021, val_loss: 0.8698, val_mse: 99.5525, lr: 0.000021, 0.7s\n",
"2024-09-27 22:46:37 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [9/10] train_loss: 0.8148, val_loss: 1.0937, val_mse: 125.1105, lr: 0.000010, 0.7s\n",
"2024-09-27 22:46:37 | unimol_tools/tasks/trainer.py | 210 | INFO | Uni-Mol Tools | Epoch [10/10] train_loss: 0.9359, val_loss: 1.1535, val_mse: 131.9864, lr: 0.000000, 0.7s\n",
"2024-09-27 22:46:38 | unimol_tools/tasks/trainer.py | 300 | INFO | Uni-Mol Tools | load model success!\n",
"2024-09-27 22:46:38 | unimol_tools/models/nnmodel.py | 168 | INFO | Uni-Mol Tools | fold 4, result {'mse': 99.55246, 'mae': 7.1317773, 'pearsonr': 0.6565703302621841, 'spearmanr': 0.7090074325220541, 'r2': 0.328890860080719}\n",
"2024-09-27 22:46:38 | unimol_tools/models/nnmodel.py | 183 | INFO | Uni-Mol Tools | Uni-Mol metrics score: \n",
"{'mse': 87.05764177737245, 'mae': 6.680970421546713, 'pearsonr': 0.4982811863609151, 'spearmanr': 0.49812542784550135, 'r2': 0.23865339883639336}\n",
"2024-09-27 22:46:38 | unimol_tools/models/nnmodel.py | 184 | INFO | Uni-Mol Tools | Uni-Mol & Metric result saved!\n"
]
}
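],
"source": [
"pred = clf.fit(data=train_data)\n",
"\n",
"# NOTE(review): mol.dict.txt may need to be downloaded manually into the Uni-Mol weights directory (TODO confirm).\n",
"# Weights directory (also the location of the package's weighthub.py script):\n",
"# /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights"
]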
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-09-27 22:46:39 | unimol_tools/data/conformer.py | 89 | INFO | Uni-Mol Tools | Start generating conformers...\n",
"38it [00:05, 6.67it/s]\n",
"2024-09-27 22:46:44 | unimol_tools/data/conformer.py | 93 | INFO | Uni-Mol Tools | Succeed to generate conformers for 100.00% of molecules.\n",
"2024-09-27 22:46:44 | unimol_tools/data/conformer.py | 95 | INFO | Uni-Mol Tools | Succeed to generate 3d conformers for 73.68% of molecules.\n",
"2024-09-27 22:46:45 | unimol_tools/models/unimol.py | 124 | INFO | Uni-Mol Tools | Loading pretrained weights from /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"WEIGHT_DIR /opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights\n",
"MODEL_CONFIG {'weight': {'protein': 'poc_pre_220816.pt', 'molecule_no_h': 'mol_pre_no_h_220816.pt', 'molecule_all_h': 'mol_pre_all_h_220816.pt', 'crystal': 'mp_all_h_230313.pt', 'oled': 'oled_pre_no_h_230101.pt'}, 'dict': {'protein': 'poc.dict.txt', 'molecule_no_h': 'mol.dict.txt', 'molecule_all_h': 'mol.dict.txt', 'crystal': 'mp.dict.txt', 'oled': 'oled.dict.txt'}}\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol_pre_all_h_220816.pt\n",
"/opt/conda/envs/analyse/lib/python3.11/site-packages/unimol_tools/weights/mol.dict.txt\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-09-27 22:46:45 | unimol_tools/models/nnmodel.py | 206 | INFO | Uni-Mol Tools | start predict NNModel:unimolv1\n",
"2024-09-27 22:46:47 | unimol_tools/tasks/trainer.py | 300 | INFO | Uni-Mol Tools | load model success!\n",
"2024-09-27 22:46:48 | unimol_tools/tasks/trainer.py | 300 | INFO | Uni-Mol Tools | load model success!\n",
"2024-09-27 22:46:50 | unimol_tools/tasks/trainer.py | 300 | INFO | Uni-Mol Tools | load model success!\n",
"2024-09-27 22:46:51 | unimol_tools/tasks/trainer.py | 300 | INFO | Uni-Mol Tools | load model success!\n",
"2024-09-27 22:46:53 | unimol_tools/tasks/trainer.py | 300 | INFO | Uni-Mol Tools | load model success!\n",
"2024-09-27 22:46:53 | unimol_tools/predict.py | 92 | INFO | Uni-Mol Tools | final predict metrics score: \n",
"{'mse': 59.72037918598548, 'mae': 5.179289798987539, 'pearsonr': 0.638764928149331, 'spearmanr': 0.6006870492749102, 'r2': 0.35928715315601223}\n"
]
}
],
"source": [
"# Predict on the held-out test split using the model(s) saved under ./exp.\n",
"# The predictor gets its own name: assigning it to `clf` would replace the MolTrain\n",
"# instance that the training cell calls `clf.fit(...)` on, so re-running that cell\n",
"# afterwards would act on the wrong object.\n",
"test_data = 'qsar_test_data.csv'  # written by the train/test split cell\n",
"predictor = MolPredict(load_model='./exp')\n",
"res = predictor.predict(data=test_data)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SMILES</th>\n",
" <th>TARGET</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...</td>\n",
" <td>0.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...</td>\n",
" <td>0.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...</td>\n",
" <td>32.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...</td>\n",
" <td>16.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]...</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SMILES TARGET\n",
"0 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... 0.5\n",
"1 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... 0.5\n",
"2 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... 32.0\n",
"3 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... 16.0\n",
"4 CC[C@H]1OC(=O)C[C@@H](O)[C@H](C)[C@@H](O[C@@H]... 4.0"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"qsar_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"189"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(qsar_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# QSAR https://bohrium.dp.tech/notebooks/1032"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "analyse",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}