first add

This commit is contained in:
mm644706215
2025-09-16 20:49:33 +08:00
commit 92850a0936
3 changed files with 365 additions and 0 deletions

40
README.md Normal file
View File

@@ -0,0 +1,40 @@
## 环境安装
```bash
conda create -n chemplot_env -y -c conda-forge chemplot
conda activate chemplot_env
# 若上述 conda 安装失败,可改用 pip 安装(此依赖组合与 mordred 兼容)
pip install "networkx==2.8.8" chemplot pandas matplotlib scikit-learn umap-learn rdkit-pypi
```
## t-SNE 的局限性
假设有三个分子系列 A、B、C,在实际化学空间中 A↔B 的距离比 A↔C 近:
t-SNE 可能只把 A、B 各自内部的点聚在一起,但图上 A↔B 与 A↔C 的距离无法反映真实的远近关系。
这意味着如果在 t-SNE 降维后的结果上做 KMeans,可能出现不合理的簇划分。
## UMAP 的优势
在 UMAP 中,A、B、C 的相对距离更接近原始分子指纹空间。
当你在降维后的空间运行 KMeans 或 HDBSCAN 时,聚类结果与真实分子结构差异更一致。
t-SNE 的时间复杂度接近 O(N^2),数据量增加后计算时间会爆炸式增长。
UMAP 在大规模分子库上表现更好,并且支持并行计算。
UMAP 是更合适的选择,因为:
- 它更好地保留了化学空间的全局结构,确保聚类结果有化学意义。
- 稳定、可复现,适合后续湿实验追踪和比较。
- 计算速度更快,支持未来大规模分子库扩展。
- 在生成交互式 HTML 时,结果更直观、可解释。
## 功能脚本
### chemplot_select_and_mark.py 介绍

View File

@@ -0,0 +1,140 @@
Cleaned_Title,Karma_Count,Vina_Count,Total_Count,Karma_Original_Titles,Vina_Original_Titles,smiles
LICODIONE,4,9,13,LICODIONE,LICODIONE_out_converted.sdf,O=C(/C=C(\[O-])c1ccc(O)cc1)c1ccc(O)cc1O
CPD-591,4,7,11,CPD-591,CPD-591_out_converted.sdf,[O-]c1cc2c([O-])cc(O)cc2[o+]c1-c1ccc(O)c(O)c1
CPD-17791,4,5,9,CPD-17791,CPD-17791_out_converted.sdf,CC1=CC(=CC(=C1C(=O)C2=C(C=C(C=C2O)O)O)O)O
CPD-6366,3,4,7,CPD-6366,CPD-6366_out_converted.sdf,COc1ccc2c(c1)OC(O)C(=O)N2[O-]
CNP0359851.1,3,3,6,CNP0359851.1,CNP0359851.1_out_converted.sdf,CC1(C)OC2=C(C[C@H]1O)C(O)=CC(O)=C2C(=O)/C=C/C1=CC=CC=C1
CNP0360022.0,3,3,6,CNP0360022.0,CNP0360022.0_out_converted.sdf,COC1=CC(OCC=C(C)C)=C(OC)C(C2=COC3=CC(O)=CC(O)=C3C2=O)=C1
CNP0288448.3,1,5,6,CNP0288448.3,CNP0288448.3_out_converted.sdf,O=C1C=C(/C=C/C2=CC=C(O)C=C2)[C@@H]2C(=O)[C@H]1[C@@H](C1=CC=C(O)C=C1)[C@@H]2C1=CC(O)=CC(O)=C1
CNP0574550.1,3,3,6,CNP0574550.1,CNP0574550.1_out_converted.sdf,C=C1[O+]=C(OC2=C([O-])CC(OC3=C([O-])CC(/[O+]=C4\CC[C@@]5(C)[C@H](CC[C@@H]6[C@@H]5CC(=O)[C@]5(C)C(C7=C=C([O-])OC7)CC[C@]65[O-])C4)=[O+]C3=C)=[O+]C2=C)CC(=O)C1=O
CNP0076684.0,3,3,6,CNP0076684.0,CNP0076684.0_out_converted.sdf,COC1=C(O)C=CC=C1C1=C(O)C(=O)C2=CC(O)=CC(O)=C2O1
CNP0124299.0,3,3,6,CNP0124299.0,CNP0124299.0_out_converted.sdf,O=C(C1=CN=C2C=CC=CC2=C1)C1=NC(=O)C2=CC=CC=C2N1
CNP0105988.1,3,3,6,CNP0105988.1,CNP0105988.1_out_converted.sdf,COC1=CC(OC)=C([C@H]2COC3=CC(O)=CC(O)=C3C2=O)C=C1O
CNP0256385.0,1,4,5,CNP0256385.0,CNP0256385.0_out_converted.sdf,O=C1C=C(C2=CC=C(O)C=C2)OC2C=C(O)CC(O)=C12
BUTEIN,2,3,5,BUTEIN,BUTEIN_out_converted.sdf,O=C(/C=C/c1ccc(O)c(O)c1)c1ccc(O)cc1O
CPD-12949,2,2,4,CPD-12949,CPD-12949_out_converted.sdf,C[C@]1(O)CC(=O)[C@@H]2c3cc4cccc(O)c4c(O)c3C(=O)C[C@]2(O)C1
CNP0404986.0,2,2,4,CNP0404986.0,CNP0404986.0_out_converted.sdf,O=C(C1=CC=C2OCOC2=C1O)C1=NC=CC2=CC3=C(C=C12)OCO3
CNP0426012.0,2,2,4,CNP0426012.0,CNP0426012.0_out_converted.sdf,COC1=CC(O)=C(O)C=C1C1=COC2=CC(O)=CC=C2C1=O
CNP0426334.0,2,2,4,CNP0426334.0,CNP0426334.0_out_converted.sdf,COC1=C(O)C=C2C(=O)C3=CC=C(O)C=C3OC2=C1OC
CNP0497639.1,3,1,4,CNP0497639.1,CNP0497639.1_out_converted.sdf,C[S@+]([O-])C1=C(CN2C3=CC=CC=C3C3=CC=NC=C32)NC=N1
CNP0348231.1,2,2,4,CNP0348231.1,CNP0348231.1_out_converted.sdf,C=C(C)[C@@H](O)CC/C(C)=C/CC1=C(OC)C(O)=CC2=C1C(=O)C1=C(O)C=C(O)C=C1O2
CNP0524070.0,2,2,4,CNP0524070.0,CNP0524070.0_out_converted.sdf,O=C1CC(=C2C=CC(O)C=C2)OC2=CC(O)=CC(O)=C12
CNP0301960.1,2,2,4,CNP0301960.1,CNP0301960.1_out_converted.sdf,O=C1C2=CC(O)=CC=C2N=C2[C@@H](O)CCN21
CPD-12724,2,2,4,CPD-12724,CPD-12724_out_converted.sdf,O=c1cc(-c2ccccc2)oc2cc(O)c(O)c(O)c12
CPD-12726,2,2,4,CPD-12726,CPD-12726_out_converted.sdf,O=c1cc(-c2ccc(O)cc2)oc2cc(O)c(O)c(O)c12
574-TRIHYDROXY-3-METHOXYFLAVONE,2,2,4,574-TRIHYDROXY-3-METHOXYFLAVONE,574-TRIHYDROXY-3-METHOXYFLAVONE_out_converted.sdf,COc1cc(-c2cc(=O)c3c(O)cc([O-])cc3o2)ccc1O
CPD-15721,2,2,4,CPD-15721,CPD-15721_out_converted.sdf,CCC1(CC(=O)C2=C(C3=C(C=C2C1C(=O)OC)C(=O)C4=C(C3=O)C(=CC=C4)O)O)O
CPD-17024,2,2,4,CPD-17024,CPD-17024_out_converted.sdf,C1=CC(=C(C(=C1)O)C(=O)C2=CC(=C(N2)Cl)Cl)O
CPD-17457,2,2,4,CPD-17457,CPD-17457_out_converted.sdf,C1C2C(C3(C=CC(=O)C=C3O1)O)OC4=CC5=C(C=C24)OCO5
CNP0234857.5,2,2,4,CNP0234857.5,CNP0234857.5_out_converted.sdf,CC(=O)O[C@@H]1C=C[C@@]23C4=CC5=C(C=C4CN(C[C@H]2O)[C@H]3C1)OCO5
CPD-19273,1,3,4,CPD-19273,CPD-19273_out_converted.sdf,CC1=C2C=CC=C(C2=C(C3=C1C=C4C=C(C(=C(C4=C3O)[O-])C(=O)N)O)O)O
CPD-21169,2,2,4,CPD-21169,CPD-21169_out_converted.sdf,C1=CC=C(C=C1)C2=NC(=CC(=N2)Cl)SCC(C(=O)[O-])[NH3+]
CPD-3622,2,2,4,CPD-3622,CPD-3622_out_converted.sdf,COc1ccc(-c2coc3cc([O-])cc(O)c3c2=O)cc1O
CPD-3629,2,2,4,CPD-3629,CPD-3629_out_converted.sdf,O=c1c(-c2cc3c(cc2O)OCO3)coc2cc(O)ccc12
CPD-3630,2,2,4,CPD-3630,CPD-3630_out_converted.sdf,O=C1c2ccc(O)cc2OC[C@H]1c1cc2c(cc1O)OCO2
CPD-431,2,2,4,CPD-431,CPD-431_out_converted.sdf,O=c1cc(-c2ccc(O)cc2)oc2cc([O-])cc(O)c12
CPD-7096,2,2,4,CPD-7096,CPD-7096_out_converted.sdf,[O-]c1cc(O)cc2[o+]c(-c3ccc(O)cc3)ccc12
CPD-9539,2,2,4,CPD-9539,CPD-9539_out_converted.sdf,COc1ccc(-c2coc3cc(O)ccc3c2=O)cc1O
CPD1F-90,1,3,4,CPD1F-90,CPD1F-90_out_converted.sdf,O=c1c(O)c(-c2ccc(O)cc2)oc2cc([O-])cc(O)c12
PHOTINUS-LUCIFERIN,2,2,4,PHOTINUS-LUCIFERIN,PHOTINUS-LUCIFERIN_out_converted.sdf,O=C([O-])[C@H]1CSC(c2nc3ccc(O)cc3s2)=N1
CPD-17787,2,2,4,CPD-17787,CPD-17787_out_converted.sdf,CC1=CC(=CC(=C1C(=O)C2=C(C=C(C=C2O)OC)O)OC)O
CNP0352637.2,2,2,4,CNP0352637.2,CNP0352637.2_out_converted.sdf,COC1=CC(O)=CC([C@H]2CC(=O)C3=C(O)C=C(O)C=C3O2)=C1
VESTITONE,2,2,4,VESTITONE,VESTITONE_out_converted.sdf,COc1ccc([C@@H]2COc3cc(O)ccc3C2=O)c(O)c1
C12135,2,2,4,C12135,C12135_out_converted.sdf,COC1=CC(=C(C=C1)C2=COC3=CC(=CC(=C3C2=O)O)O)O
CNP0076169.0,2,2,4,CNP0076169.0,CNP0076169.0_out_converted.sdf,CC1=C(O)C=C(O)C2=C1OC(C1=CC=C(O)C=C1)=CC2=O
CNP0078636.1,2,2,4,CNP0078636.1,CNP0078636.1_out_converted.sdf,COC1=C(O)C(O)=C2C(=O)[C@H](CC3=CC=C(O)C=C3)COC2=C1OC
CNP0080149.0,2,2,4,CNP0080149.0,CNP0080149.0_out_converted.sdf,COC1=C(O)C=C2C(=O)C3=C(CCCC3)C(=O)C2=C1O
CNP0106634.1,2,2,4,CNP0106634.1,CNP0106634.1_out_converted.sdf,CC1(C)C=CC2=C(O)C=CC([C@H]3CC(=O)C4=C(O)C=C(O)C=C4O3)=C2O1
CNP0106817.1,2,2,4,CNP0106817.1,CNP0106817.1_out_converted.sdf,COC1=CC=C([C@@H](C)C(=O)C2=CC=C(OC)C=C2O)C=C1
CNP0126704.1,2,2,4,CNP0126704.1,CNP0126704.1_out_converted.sdf,COC1=CC(C2=COC3=CC(O)=CC(O)=C3C2=O)=CC2=C1O[C@@H](C(C)(C)O)C2
CNP0143640.1,2,2,4,CNP0143640.1,CNP0143640.1_out_converted.sdf,COC(=O)[C@H]1COC(C2=CC=CC=C2O)=N1
CNP0145895.1,2,2,4,CNP0145895.1,CNP0145895.1_out_converted.sdf,COC1=C(C)C(O)=C(C(C)=O)C2=C1[C@]1(C)C(=O)C(C(C)=O)=C(O)C[C@]1(OC)O2
CNP0172162.2,2,2,4,CNP0172162.2,CNP0172162.2_out_converted.sdf,COC1=C(C)C(=O)C2=C(C=C(O)C3=C2O[C@H](C)[C@]3(C)[C@H](O)C/C=C(/C)CO)C1=O
CNP0354771.1,1,2,3,CNP0354771.1,CNP0354771.1_out_converted.sdf,C[C@H](CC(=O)C[C@H](C)[C@@H]1C[C@H](O)[C@]2(C)C3=C(C(=O)C[C@@]12C)[C@]1(C)CCC(=O)C(C)(C)[C@@H]1CC3=O)C(=O)O
CNP0426121.1,1,2,3,CNP0426121.1,CNP0426121.1_out_converted.sdf,COC1=CC=C([C@@H]2CC(=O)C3=C(O)C=C(O)C(CC(O)=C(C)C)=C3O2)C=C1
CNP0426105.1,1,2,3,CNP0426105.1,CNP0426105.1_out_converted.sdf,COC1=CC=C([C@@H]2CCC3=C(O)C=C(O)C=C3O2)C=C1
CNP0401138.0,1,2,3,CNP0401138.0,CNP0401138.0_out_converted.sdf,O=S(=O)(O)C1=CC=C2C(=C1)OC1=CC=CC=C12
CNP0425564.0,1,2,3,CNP0425564.0,CNP0425564.0_out_converted.sdf,COC(=O)C1=CC=C(OC)C(OC2=CC=C(C(=O)O)C=C2)=C1
CNP0213003.1,1,2,3,CNP0213003.1,CNP0213003.1_out_converted.sdf,CN1C(=N)N(C)[C@@H](CC2=CNC3=CC=CC=C23)C1=O
CNP0279015.1,1,2,3,CNP0279015.1,CNP0279015.1_out_converted.sdf,C[C@@H]1CC[C@H]2C(C)(C)CCC[C@]2(C)/C1=C/C1=CC(O)=C(O)C(/C=N/CCS(=O)(=O)O)=C1OS(=O)(=O)O
CNP0300571.1,1,1,2,CNP0300571.1,CNP0300571.1_out_converted.sdf,CO[C@@H]1C[C@@H](C2=CC=C(O)C=C2)OC2=CC(O)=CC=C21
CPD-15172,1,1,2,CPD-15172,CPD-15172_out_converted.sdf,C1=CC=C(C=C1)C2=CC(=O)C3=C(O2)C=C(C(=O)C3=O)O
CNP0138834.1,1,1,2,CNP0138834.1,CNP0138834.1_out_converted.sdf,CC(C)=CCC1=CC([C@@H]2COC3=CC(O)=CC(O)=C3C2=O)=CC(O)=C1O
CNP0139950.1,1,1,2,CNP0139950.1,CNP0139950.1_out_converted.sdf,C=C(C)[C@@H](O)CC1=CC(C2=CC(=O)C3=C(O)C=C(O)C=C3O2)=CC2=C1OC(C)(C)C=C2
CPD-17072,1,1,2,CPD-17072,CPD-17072_out_converted.sdf,CC(=O)C1=C(C(NC1=O)CC2=CNC3=CC=CC=C32)[O-]
CNP0142417.1,1,1,2,CNP0142417.1,CNP0142417.1_out_converted.sdf,C=C(C)[C@@H]1CC2=C(O)C=CC(C3=COC4=CC(O)=CC(O)=C4C3=O)=C2O1
CPD-15777,1,1,2,CPD-15777,CPD-15777_out_converted.sdf,CC1C(=CC=CN1CC2=CN=C(N=C2N)C)CCO
CNP0142712.2,1,1,2,CNP0142712.2,CNP0142712.2_out_converted.sdf,COC1=CC2=C(C=C1OC)[C@@H](C(=O)C1=CC=C3OC=CC3=C1O)CCO2
CPD-15265,1,1,2,CPD-15265,CPD-15265_out_converted.sdf,C1CC2C(=O)NC(C(=O)N2C1)CC3=CNC4=CC=CC=C43
CPD-14164,1,1,2,CPD-14164,CPD-14164_out_converted.sdf,C1=CC(=C(C(=C1)O)O)C2=C(C(=CC=C2)O)O
CNP0135275.1,1,1,2,CNP0135275.1,CNP0135275.1_out_converted.sdf,C=C1CC[C@@H](O)[C@]2(C)CC[C@@H]([C@@H](C)C(=O)O)C[C@H]12
CNP0260962.2,1,1,2,CNP0260962.2,CNP0260962.2_out_converted.sdf,CC1=CC(C)(C)C(C=O)=C[C@@H]1OC(=O)/C=C(\C)CO
CNP0273031.1,1,1,2,CNP0273031.1,CNP0273031.1_out_converted.sdf,C[C@@H]1CC[C@]2(C)[C@H](CO)CCC[C@@H]2[C@@]1(C)CC/C(=C/CO)CO
CNP0146656.1,1,1,2,CNP0146656.1,CNP0146656.1_out_converted.sdf,C[C@@H]1C(=O)O[C@H]2[C@H]1[C@@H](O)C[C@@]1(C)C(=O)C=C[C@](C)(O)[C@H]21
CPD-12022,1,1,2,CPD-12022,CPD-12022_out_converted.sdf,COc1ccc(NC=O)c(C(=O)CCNC(C)=O)c1
CPD-11556,1,1,2,CPD-11556,CPD-11556_out_converted.sdf,Cc1cc(O)cc2c1C(=O)CC(O)(Cc1cc([O-])cc(=O)o1)O2
CPD-11553,1,1,2,CPD-11553,CPD-11553_out_converted.sdf,Cc1cc(O)cc(O)c1-c1cc([O-])cc(=O)o1
CPD-10176,1,1,2,CPD-10176,CPD-10176_out_converted.sdf,O=C1c2cc([O-])cc(O)c2C(=O)c2c1cc1c(c2O)[C@@H]2C=CO[C@@H]2O1
CNP0136916.1,1,1,2,CNP0136916.1,CNP0136916.1_out_converted.sdf,COC1=CC(C)=C(C(=O)O[C@@H]2C[C@]3(C)[C@H]4[C@@H](O)C(C)(C)C[C@@H]4C=C(CO)[C@]23O)C(O)=C1
CPD-19294,1,1,2,CPD-19294,CPD-19294_out_converted.sdf,C1C(OC2=C(C1=O)C=CC(=C2)O)C3=CC(=C(C=C3)O)O
CNP0148342.3,1,1,2,CNP0148342.3,CNP0148342.3_out_converted.sdf,COC1=CC=C2C(=C1)OC[C@@]1(O)C3=CC4=C(C=C3O[C@@H]21)OCO4
CNP0260370.1,1,1,2,CNP0260370.1,CNP0260370.1_out_converted.sdf,CC1=C(C)C2=CC(=O)[C@](C)(O)[C@H](O)[C@@H]2CO1
CNP0232261.2,1,1,2,CNP0232261.2,CNP0232261.2_out_converted.sdf,COC1=CC=C([C@H]2COC3=CC(O)=CC=C3C2)C(O)=C1O
C16405,1,1,2,C16405,C16405_out_converted.sdf,COC1=C(C=CC(=C1)C=CC(=O)C2=C(C=C(C=C2O)O)O)O
DCPIP,1,1,2,DCPIP,DCPIP_out_converted.sdf,O=C1C(Cl)=CC(=Nc2ccc(O)cc2)C=C1Cl
C19588,1,1,2,C19588,C19588_out_converted.sdf,COC1=C2C3=C(C(=O)CC3)C(=O)OC2=C4C5C(C(OC5OC4=C1)O)O
CPD-9557,1,1,2,CPD-9557,CPD-9557_out_converted.sdf,Cc1cc(O)c2c(c1)Cc1cc(O)cc(O)c1C2=O
CNP0216219.17,1,1,2,CNP0216219.17,CNP0216219.17_out_converted.sdf,CO[C@H]1C=C[C@@]23C4=CC5=C(C=C4[C@H](O)[N@@](C[C@@H]2O)[C@H]3C1)OCO5
CPD-8215,1,1,2,CPD-8215,CPD-8215_out_converted.sdf,Cc1cc(O)c2c(c1)C(=O)c1cc([O-])cc(O)c1C2=O
CNP0243499.1,1,1,2,CNP0243499.1,CNP0243499.1_out_converted.sdf,COC(=O)/C(C)=C/[C@@H](O)C1=CC2=CC=C(OC)C(O)=C2OC1=O
CPD-7027,1,1,2,CPD-7027,CPD-7027_out_converted.sdf,COc1cc2c(=O)c(-c3ccc(O)cc3)coc2cc1O
CPD-6955,1,1,2,CPD-6955,CPD-6955_out_converted.sdf,O=C(/C=C/c1ccc(O)cc1)Cc1cc([O-])cc(=O)o1
CPD-693,1,1,2,CPD-693,CPD-693_out_converted.sdf,CC1=CC(=O)CC(C)(C)[C@@]1(O)/C=C/C(C)=C\C(=O)[O-]
CNP0248017.2,1,1,2,CNP0248017.2,CNP0248017.2_out_converted.sdf,CCC(=O)NCC[C@@]1(O)C2=C(N=CS2)C2=NC=CC3=C4C=CC=CC4=NC1=C23
CNP0090058.1,1,1,2,CNP0090058.1,CNP0090058.1_out_converted.sdf,CCOC(=O)C1=C(C=O)C2=C3CCC(=O)[C@H](C3)C3=CC4=C(C=C3CC[C@@H](CO)COC3=C2C(=C(CO)C2=C3C[C@@H]([C@](C)(O)CCCO)O2)O1)NC=C4
CPD-498,1,1,2,CPD-498,CPD-498_out_converted.sdf,C=C1C[C@]23C[C@H]1CC[C@@H]2C1=CC(=O)C[C@@](C)(C(=O)[O-])[C@H]1[C@@H]3C(=O)[O-]
CNP0251052.1,1,1,2,CNP0251052.1,CNP0251052.1_out_converted.sdf,C[C@@H](CO)[C@@H](C)C[C@@H](O)[C@](C)(O)[C@H]1CC[C@@]2(O)C3=CC(=O)[C@]4(O)C[C@@H](O)[C@@H](O)C[C@]4(C)[C@H]3CC[C@]12C
CNP0120483.1,1,1,2,CNP0120483.1,CNP0120483.1_out_converted.sdf,C=C(C)C1=CC2=C3O[C@H]4C5=CC=C(O)C=C5OC[C@@]4(O)C3=CC=C2O1
CNP0122740.1,1,1,2,CNP0122740.1,CNP0122740.1_out_converted.sdf,C=C1CC[C@@H]2[C@@](C)(CO)C[C@@H](O)C[C@]2(C)[C@@H]1CCC1=CC(=O)OC1
CNP0147378.1,1,1,2,CNP0147378.1,CNP0147378.1_out_converted.sdf,CC(=O)OC[C@]12O[C@]34C=C(C)C(=O)C[C@]3(C)[C@@]1(C)[C@H](O)C[C@H]2O4
CNP0153223.1,1,1,2,CNP0153223.1,CNP0153223.1_out_converted.sdf,CCC[C@H]1OCC2=C([C@@H](O)[C@@H]3O[C@@H]3C2=O)[C@@H]1O
CNP0210135.1,1,1,2,CNP0210135.1,CNP0210135.1_out_converted.sdf,COC1=CC=C2C(=C1)O[C@@H]1C3=CC(OC)=C(O)C=C3OC[C@H]21
CNP0426805.1,1,1,2,CNP0426805.1,CNP0426805.1_out_converted.sdf,CC[C@@H]1CN2CCC3=C(NC4=CC=CC=C34)C2=C/C1=C(\C=O)C(=O)OC
CNP0191964.1,1,1,2,CNP0191964.1,CNP0191964.1_out_converted.sdf,C/C=C(\C)[C@H]1OC2=C(C(=O)NC=C2C2=CC=CC=C2)[C@]1(C)CO
CNP0199889.5,1,1,2,CNP0199889.5,CNP0199889.5_out_converted.sdf,C=C1C(=O)O[C@@H]2CC(C)=C([C@@H](C)CCCOC(C)=O)[C@H](O)[C@@H]12
CNP0359684.0,1,1,2,CNP0359684.0,CNP0359684.0_out_converted.sdf,COC1=CC2=C(OC3=CC=C(O)C=C3C2=O)C(OC)=C1OC
CNP0292276.1,1,1,2,CNP0292276.1,CNP0292276.1_out_converted.sdf,C[C@H](C(=O)O)[C@H]1CCC2=CC(=O)C[C@H](C)[C@@]2(C)C1
CNP0353297.0,1,1,2,CNP0353297.0,CNP0353297.0_out_converted.sdf,COC1=CC2=C(C=C1O)C(C=O)=C(C1=CC=C(O)C=C1O)O2
CNP0204494.2,1,1,2,CNP0204494.2,CNP0204494.2_out_converted.sdf,COC1=CC(O)=CC2=C1C(=O)[C@@H](C1=CC=C3OC(C)(C)C=CC3=C1O)CO2
CNP0352117.1,1,1,2,CNP0352117.1,CNP0352117.1_out_converted.sdf,CC[C@H](C)C[C@@]1(C)C=C/C(=C2/C(=O)O[C@H](CC(=O)O)C2=O)O1
CNP0204979.2,1,1,2,CNP0204979.2,CNP0204979.2_out_converted.sdf,CC(C)=CCC1=C([C@H]2CC(=O)C3=C(O)C=C(O)C=C3O2)C=CC(O)=C1O
CNP0342898.3,1,1,2,CNP0342898.3,CNP0342898.3_out_converted.sdf,COC1=CC(O)=CC([C@H](/C=C\C2=CC=C(O)C=C2)[C@@H](O)CO)=C1
CNP0329144.1,1,1,2,CNP0329144.1,CNP0329144.1_out_converted.sdf,C=C(C)[C@@H](O)CC1=C(OC)C(=O)C2=CC(OC)=C(O)C(O)=C2C1=O
CNP0328219.1,1,1,2,CNP0328219.1,CNP0328219.1_out_converted.sdf,C=C1[C@@H](O)C[C@H]2C(C)(C)C[C@H](Cl)C[C@@]2(C)[C@H]1C[C@H](O)[C@H]1CC(=O)NC1=O
CNP0320138.1,1,1,2,CNP0320138.1,CNP0320138.1_out_converted.sdf,C=C(C)[C@@H](CC1=C2OC(C)(C)C=CC2=C(O)C2=C1OC1=CC=C(O)C=C1C2=O)OO
CNP0318279.1,1,1,2,CNP0318279.1,CNP0318279.1_out_converted.sdf,COC(=O)N1C=C([C@](OC)(C2=CNC3=CC(Br)=CC=C23)S(=O)(=O)O)C2=CC=C(Br)C=C21
CNP0317733.1,1,1,2,CNP0317733.1,CNP0317733.1_out_converted.sdf,C=C1C(=O)O[C@H](C2=COC=C2)C[C@@H]1[C@@]1(C)CC=C[C@@]2(C)COC(=O)[C@H]12
CNP0308290.1,1,1,2,CNP0308290.1,CNP0308290.1_out_converted.sdf,C[C@@H]1CC(=O)[C@]23CO[C@@H](O)[C@@H]2CC(=O)C[C@@H]3[C@@]12C[C@@H](C1=COC=C1)OC2=O
CNP0307229.6,1,1,2,CNP0307229.6,CNP0307229.6_out_converted.sdf,O=C(OC1C[C@H]2CC[C@@H](C1)N2)[C@@H](CO)C1=CC=CC=C1
CNP0302706.2,1,1,2,CNP0302706.2,CNP0302706.2_out_converted.sdf,C[C@H]1C(=O)C2=C(O)C=C(O)C=C2O[C@H]1C1=CC=C(O)C=C1
CNP0363021.1,1,1,2,CNP0363021.1,CNP0363021.1_out_converted.sdf,O[C@@]12COC3=CC4=C(C=CO4)C=C3[C@@H]1OC1=CC3=C(C=C12)OCO3
CNP0365331.1,1,1,2,CNP0365331.1,CNP0365331.1_out_converted.sdf,O=C1C[C@@H](C2=CC=C3OCC=CC3=C2)OC2=CC(O)=CC=C12
CNP0399207.2,1,1,2,CNP0399207.2,CNP0399207.2_out_converted.sdf,O=C1C[C@H]2O[C@H]([C@H](O)C3=CC=CC=C3)[C@@H](O)[C@@H]2O1
CNP0425372.1,1,1,2,CNP0425372.1,CNP0425372.1_out_converted.sdf,CC[C@H](C)[C@H](C1=C(O)C(C)(C)C(=O)C(C)(C)C1=O)C1=C(O)C(C)=C(O)C(C(=O)CCC2=CC=CC=C2)=C1O
CNP0426639.0,1,1,2,CNP0426639.0,CNP0426639.0_out_converted.sdf,O=C1C2=C(COC3=CC=CC(O)=C32)OC2=CC3=C(OCO3)C(O)=C12
CNP0165737.1,1,1,2,CNP0165737.1,CNP0165737.1_out_converted.sdf,C/C=C(\C=C(C)\C=C\CC/C=C(\C)C(=O)[C@]12O[C@H]1[C@@](C)(O)NC2=O)C(=O)OC
CNP0426155.1,1,1,2,CNP0426155.1,CNP0426155.1_out_converted.sdf,CC[C@H](C)[C@@H]1NCC[C@]12C(=O)NC1=CC(O)=CC=C12
CNP0276575.3,1,1,2,CNP0276575.3,CNP0276575.3_out_converted.sdf,O=C1C=CC[C@@H](/C=C/C[C@H](O)C[C@@H](O)/C=C/C2=CC=CC=C2)O1
CNP0286670.1,1,1,2,CNP0286670.1,CNP0286670.1_out_converted.sdf,C=CC(C)(C)C1=CC([C@H]2COC3=CC(O)=CC=C3C2)=C(O)C(O)=C1OC
CNP0228022.1,1,1,2,CNP0228022.1,CNP0228022.1_out_converted.sdf,CC1=CC=C(C(=O)O[C@@H]2C=C3CCN(C)[C@H]3[C@H](C3=CC4=C(C=C3C(=O)O)OCO4)C2)C(C)=N1
CNP0425515.0,1,1,2,CNP0425515.0,CNP0425515.0_out_converted.sdf,COC(=O)C1=C(C2=CNC3=CC(Cl)=C(Cl)C=C23)C(C2=CNC3=CC=C(Cl)C=C23)=CN1
CNP0425203.1,1,1,2,CNP0425203.1,CNP0425203.1_out_converted.sdf,CC1(C)CCCC(=O)[C@H]1CCC1=C[C@@H](O)CC1=O
CNP0212403.1,1,1,2,CNP0212403.1,CNP0212403.1_out_converted.sdf,COC1=C([C@]2(O)COC3=CC(O)=CC(O)=C3C2=O)C=CC(O)=C1CC=C(C)C
CNP0424844.1,1,1,2,CNP0424844.1,CNP0424844.1_out_converted.sdf,COC1=CC2=C(C(O)=C1C1=CC(=O)CC1)[C@@H]1C=CO[C@@H]1O2
CNP0424373.0,1,1,2,CNP0424373.0,CNP0424373.0_out_converted.sdf,CN1C2=CC3=C(C=C2C(=O)C2=C1OC=C2)OCO3
CNP0424274.0,1,1,2,CNP0424274.0,CNP0424274.0_out_converted.sdf,COC1=CC(O)=C(C2=COC3=CC(O)=CC(O)=C3C2=O)C=C1CC=C(C)C
CNP0424223.0,1,1,2,CNP0424223.0,CNP0424223.0_out_converted.sdf,OC1=CC=C2C(C3=C(O)C=CC4=CC(O)=CC=C34)=C(O)C=CC2=C1
CNP0412522.2,1,1,2,CNP0412522.2,CNP0412522.2_out_converted.sdf,COC1=CC2=C(C=C1[C@H]1COC3=CC(O)=CC=C3C1)OCO2
CNP0406736.1,1,1,2,CNP0406736.1,CNP0406736.1_out_converted.sdf,C[C@H](O)/C=C1\C[C@H](O)[C@@]23C[C@@H]2C(C)(C)O[C@@]3(O)C1=O
CNP0179144.1,1,1,2,CNP0179144.1,CNP0179144.1_out_converted.sdf,COC1=CC2=C(CN3C[C@@H](O)[C@]24CC[C@H](OC)C[C@@H]34)C(OC)=C1O
CNP0226086.2,1,1,2,CNP0226086.2,CNP0226086.2_out_converted.sdf,C=C1C(=O)O[C@@H](/C=C(\C)CO)[C@H]1[C@@H](C/C(C)=C/CO)OC(=O)/C(C)=C\C
1 Cleaned_Title Karma_Count Vina_Count Total_Count Karma_Original_Titles Vina_Original_Titles smiles
2 LICODIONE 4 9 13 LICODIONE LICODIONE_out_converted.sdf O=C(/C=C(\[O-])c1ccc(O)cc1)c1ccc(O)cc1O
3 CPD-591 4 7 11 CPD-591 CPD-591_out_converted.sdf [O-]c1cc2c([O-])cc(O)cc2[o+]c1-c1ccc(O)c(O)c1
4 CPD-17791 4 5 9 CPD-17791 CPD-17791_out_converted.sdf CC1=CC(=CC(=C1C(=O)C2=C(C=C(C=C2O)O)O)O)O
5 CPD-6366 3 4 7 CPD-6366 CPD-6366_out_converted.sdf COc1ccc2c(c1)OC(O)C(=O)N2[O-]
6 CNP0359851.1 3 3 6 CNP0359851.1 CNP0359851.1_out_converted.sdf CC1(C)OC2=C(C[C@H]1O)C(O)=CC(O)=C2C(=O)/C=C/C1=CC=CC=C1
7 CNP0360022.0 3 3 6 CNP0360022.0 CNP0360022.0_out_converted.sdf COC1=CC(OCC=C(C)C)=C(OC)C(C2=COC3=CC(O)=CC(O)=C3C2=O)=C1
8 CNP0288448.3 1 5 6 CNP0288448.3 CNP0288448.3_out_converted.sdf O=C1C=C(/C=C/C2=CC=C(O)C=C2)[C@@H]2C(=O)[C@H]1[C@@H](C1=CC=C(O)C=C1)[C@@H]2C1=CC(O)=CC(O)=C1
9 CNP0574550.1 3 3 6 CNP0574550.1 CNP0574550.1_out_converted.sdf C=C1[O+]=C(OC2=C([O-])CC(OC3=C([O-])CC(/[O+]=C4\CC[C@@]5(C)[C@H](CC[C@@H]6[C@@H]5CC(=O)[C@]5(C)C(C7=C=C([O-])OC7)CC[C@]65[O-])C4)=[O+]C3=C)=[O+]C2=C)CC(=O)C1=O
10 CNP0076684.0 3 3 6 CNP0076684.0 CNP0076684.0_out_converted.sdf COC1=C(O)C=CC=C1C1=C(O)C(=O)C2=CC(O)=CC(O)=C2O1
11 CNP0124299.0 3 3 6 CNP0124299.0 CNP0124299.0_out_converted.sdf O=C(C1=CN=C2C=CC=CC2=C1)C1=NC(=O)C2=CC=CC=C2N1
12 CNP0105988.1 3 3 6 CNP0105988.1 CNP0105988.1_out_converted.sdf COC1=CC(OC)=C([C@H]2COC3=CC(O)=CC(O)=C3C2=O)C=C1O
13 CNP0256385.0 1 4 5 CNP0256385.0 CNP0256385.0_out_converted.sdf O=C1C=C(C2=CC=C(O)C=C2)OC2C=C(O)CC(O)=C12
14 BUTEIN 2 3 5 BUTEIN BUTEIN_out_converted.sdf O=C(/C=C/c1ccc(O)c(O)c1)c1ccc(O)cc1O
15 CPD-12949 2 2 4 CPD-12949 CPD-12949_out_converted.sdf C[C@]1(O)CC(=O)[C@@H]2c3cc4cccc(O)c4c(O)c3C(=O)C[C@]2(O)C1
16 CNP0404986.0 2 2 4 CNP0404986.0 CNP0404986.0_out_converted.sdf O=C(C1=CC=C2OCOC2=C1O)C1=NC=CC2=CC3=C(C=C12)OCO3
17 CNP0426012.0 2 2 4 CNP0426012.0 CNP0426012.0_out_converted.sdf COC1=CC(O)=C(O)C=C1C1=COC2=CC(O)=CC=C2C1=O
18 CNP0426334.0 2 2 4 CNP0426334.0 CNP0426334.0_out_converted.sdf COC1=C(O)C=C2C(=O)C3=CC=C(O)C=C3OC2=C1OC
19 CNP0497639.1 3 1 4 CNP0497639.1 CNP0497639.1_out_converted.sdf C[S@+]([O-])C1=C(CN2C3=CC=CC=C3C3=CC=NC=C32)NC=N1
20 CNP0348231.1 2 2 4 CNP0348231.1 CNP0348231.1_out_converted.sdf C=C(C)[C@@H](O)CC/C(C)=C/CC1=C(OC)C(O)=CC2=C1C(=O)C1=C(O)C=C(O)C=C1O2
21 CNP0524070.0 2 2 4 CNP0524070.0 CNP0524070.0_out_converted.sdf O=C1CC(=C2C=CC(O)C=C2)OC2=CC(O)=CC(O)=C12
22 CNP0301960.1 2 2 4 CNP0301960.1 CNP0301960.1_out_converted.sdf O=C1C2=CC(O)=CC=C2N=C2[C@@H](O)CCN21
23 CPD-12724 2 2 4 CPD-12724 CPD-12724_out_converted.sdf O=c1cc(-c2ccccc2)oc2cc(O)c(O)c(O)c12
24 CPD-12726 2 2 4 CPD-12726 CPD-12726_out_converted.sdf O=c1cc(-c2ccc(O)cc2)oc2cc(O)c(O)c(O)c12
25 574-TRIHYDROXY-3-METHOXYFLAVONE 2 2 4 574-TRIHYDROXY-3-METHOXYFLAVONE 574-TRIHYDROXY-3-METHOXYFLAVONE_out_converted.sdf COc1cc(-c2cc(=O)c3c(O)cc([O-])cc3o2)ccc1O
26 CPD-15721 2 2 4 CPD-15721 CPD-15721_out_converted.sdf CCC1(CC(=O)C2=C(C3=C(C=C2C1C(=O)OC)C(=O)C4=C(C3=O)C(=CC=C4)O)O)O
27 CPD-17024 2 2 4 CPD-17024 CPD-17024_out_converted.sdf C1=CC(=C(C(=C1)O)C(=O)C2=CC(=C(N2)Cl)Cl)O
28 CPD-17457 2 2 4 CPD-17457 CPD-17457_out_converted.sdf C1C2C(C3(C=CC(=O)C=C3O1)O)OC4=CC5=C(C=C24)OCO5
29 CNP0234857.5 2 2 4 CNP0234857.5 CNP0234857.5_out_converted.sdf CC(=O)O[C@@H]1C=C[C@@]23C4=CC5=C(C=C4CN(C[C@H]2O)[C@H]3C1)OCO5
30 CPD-19273 1 3 4 CPD-19273 CPD-19273_out_converted.sdf CC1=C2C=CC=C(C2=C(C3=C1C=C4C=C(C(=C(C4=C3O)[O-])C(=O)N)O)O)O
31 CPD-21169 2 2 4 CPD-21169 CPD-21169_out_converted.sdf C1=CC=C(C=C1)C2=NC(=CC(=N2)Cl)SCC(C(=O)[O-])[NH3+]
32 CPD-3622 2 2 4 CPD-3622 CPD-3622_out_converted.sdf COc1ccc(-c2coc3cc([O-])cc(O)c3c2=O)cc1O
33 CPD-3629 2 2 4 CPD-3629 CPD-3629_out_converted.sdf O=c1c(-c2cc3c(cc2O)OCO3)coc2cc(O)ccc12
34 CPD-3630 2 2 4 CPD-3630 CPD-3630_out_converted.sdf O=C1c2ccc(O)cc2OC[C@H]1c1cc2c(cc1O)OCO2
35 CPD-431 2 2 4 CPD-431 CPD-431_out_converted.sdf O=c1cc(-c2ccc(O)cc2)oc2cc([O-])cc(O)c12
36 CPD-7096 2 2 4 CPD-7096 CPD-7096_out_converted.sdf [O-]c1cc(O)cc2[o+]c(-c3ccc(O)cc3)ccc12
37 CPD-9539 2 2 4 CPD-9539 CPD-9539_out_converted.sdf COc1ccc(-c2coc3cc(O)ccc3c2=O)cc1O
38 CPD1F-90 1 3 4 CPD1F-90 CPD1F-90_out_converted.sdf O=c1c(O)c(-c2ccc(O)cc2)oc2cc([O-])cc(O)c12
39 PHOTINUS-LUCIFERIN 2 2 4 PHOTINUS-LUCIFERIN PHOTINUS-LUCIFERIN_out_converted.sdf O=C([O-])[C@H]1CSC(c2nc3ccc(O)cc3s2)=N1
40 CPD-17787 2 2 4 CPD-17787 CPD-17787_out_converted.sdf CC1=CC(=CC(=C1C(=O)C2=C(C=C(C=C2O)OC)O)OC)O
41 CNP0352637.2 2 2 4 CNP0352637.2 CNP0352637.2_out_converted.sdf COC1=CC(O)=CC([C@H]2CC(=O)C3=C(O)C=C(O)C=C3O2)=C1
42 VESTITONE 2 2 4 VESTITONE VESTITONE_out_converted.sdf COc1ccc([C@@H]2COc3cc(O)ccc3C2=O)c(O)c1
43 C12135 2 2 4 C12135 C12135_out_converted.sdf COC1=CC(=C(C=C1)C2=COC3=CC(=CC(=C3C2=O)O)O)O
44 CNP0076169.0 2 2 4 CNP0076169.0 CNP0076169.0_out_converted.sdf CC1=C(O)C=C(O)C2=C1OC(C1=CC=C(O)C=C1)=CC2=O
45 CNP0078636.1 2 2 4 CNP0078636.1 CNP0078636.1_out_converted.sdf COC1=C(O)C(O)=C2C(=O)[C@H](CC3=CC=C(O)C=C3)COC2=C1OC
46 CNP0080149.0 2 2 4 CNP0080149.0 CNP0080149.0_out_converted.sdf COC1=C(O)C=C2C(=O)C3=C(CCCC3)C(=O)C2=C1O
47 CNP0106634.1 2 2 4 CNP0106634.1 CNP0106634.1_out_converted.sdf CC1(C)C=CC2=C(O)C=CC([C@H]3CC(=O)C4=C(O)C=C(O)C=C4O3)=C2O1
48 CNP0106817.1 2 2 4 CNP0106817.1 CNP0106817.1_out_converted.sdf COC1=CC=C([C@@H](C)C(=O)C2=CC=C(OC)C=C2O)C=C1
49 CNP0126704.1 2 2 4 CNP0126704.1 CNP0126704.1_out_converted.sdf COC1=CC(C2=COC3=CC(O)=CC(O)=C3C2=O)=CC2=C1O[C@@H](C(C)(C)O)C2
50 CNP0143640.1 2 2 4 CNP0143640.1 CNP0143640.1_out_converted.sdf COC(=O)[C@H]1COC(C2=CC=CC=C2O)=N1
51 CNP0145895.1 2 2 4 CNP0145895.1 CNP0145895.1_out_converted.sdf COC1=C(C)C(O)=C(C(C)=O)C2=C1[C@]1(C)C(=O)C(C(C)=O)=C(O)C[C@]1(OC)O2
52 CNP0172162.2 2 2 4 CNP0172162.2 CNP0172162.2_out_converted.sdf COC1=C(C)C(=O)C2=C(C=C(O)C3=C2O[C@H](C)[C@]3(C)[C@H](O)C/C=C(/C)CO)C1=O
53 CNP0354771.1 1 2 3 CNP0354771.1 CNP0354771.1_out_converted.sdf C[C@H](CC(=O)C[C@H](C)[C@@H]1C[C@H](O)[C@]2(C)C3=C(C(=O)C[C@@]12C)[C@]1(C)CCC(=O)C(C)(C)[C@@H]1CC3=O)C(=O)O
54 CNP0426121.1 1 2 3 CNP0426121.1 CNP0426121.1_out_converted.sdf COC1=CC=C([C@@H]2CC(=O)C3=C(O)C=C(O)C(CC(O)=C(C)C)=C3O2)C=C1
55 CNP0426105.1 1 2 3 CNP0426105.1 CNP0426105.1_out_converted.sdf COC1=CC=C([C@@H]2CCC3=C(O)C=C(O)C=C3O2)C=C1
56 CNP0401138.0 1 2 3 CNP0401138.0 CNP0401138.0_out_converted.sdf O=S(=O)(O)C1=CC=C2C(=C1)OC1=CC=CC=C12
57 CNP0425564.0 1 2 3 CNP0425564.0 CNP0425564.0_out_converted.sdf COC(=O)C1=CC=C(OC)C(OC2=CC=C(C(=O)O)C=C2)=C1
58 CNP0213003.1 1 2 3 CNP0213003.1 CNP0213003.1_out_converted.sdf CN1C(=N)N(C)[C@@H](CC2=CNC3=CC=CC=C23)C1=O
59 CNP0279015.1 1 2 3 CNP0279015.1 CNP0279015.1_out_converted.sdf C[C@@H]1CC[C@H]2C(C)(C)CCC[C@]2(C)/C1=C/C1=CC(O)=C(O)C(/C=N/CCS(=O)(=O)O)=C1OS(=O)(=O)O
60 CNP0300571.1 1 1 2 CNP0300571.1 CNP0300571.1_out_converted.sdf CO[C@@H]1C[C@@H](C2=CC=C(O)C=C2)OC2=CC(O)=CC=C21
61 CPD-15172 1 1 2 CPD-15172 CPD-15172_out_converted.sdf C1=CC=C(C=C1)C2=CC(=O)C3=C(O2)C=C(C(=O)C3=O)O
62 CNP0138834.1 1 1 2 CNP0138834.1 CNP0138834.1_out_converted.sdf CC(C)=CCC1=CC([C@@H]2COC3=CC(O)=CC(O)=C3C2=O)=CC(O)=C1O
63 CNP0139950.1 1 1 2 CNP0139950.1 CNP0139950.1_out_converted.sdf C=C(C)[C@@H](O)CC1=CC(C2=CC(=O)C3=C(O)C=C(O)C=C3O2)=CC2=C1OC(C)(C)C=C2
64 CPD-17072 1 1 2 CPD-17072 CPD-17072_out_converted.sdf CC(=O)C1=C(C(NC1=O)CC2=CNC3=CC=CC=C32)[O-]
65 CNP0142417.1 1 1 2 CNP0142417.1 CNP0142417.1_out_converted.sdf C=C(C)[C@@H]1CC2=C(O)C=CC(C3=COC4=CC(O)=CC(O)=C4C3=O)=C2O1
66 CPD-15777 1 1 2 CPD-15777 CPD-15777_out_converted.sdf CC1C(=CC=CN1CC2=CN=C(N=C2N)C)CCO
67 CNP0142712.2 1 1 2 CNP0142712.2 CNP0142712.2_out_converted.sdf COC1=CC2=C(C=C1OC)[C@@H](C(=O)C1=CC=C3OC=CC3=C1O)CCO2
68 CPD-15265 1 1 2 CPD-15265 CPD-15265_out_converted.sdf C1CC2C(=O)NC(C(=O)N2C1)CC3=CNC4=CC=CC=C43
69 CPD-14164 1 1 2 CPD-14164 CPD-14164_out_converted.sdf C1=CC(=C(C(=C1)O)O)C2=C(C(=CC=C2)O)O
70 CNP0135275.1 1 1 2 CNP0135275.1 CNP0135275.1_out_converted.sdf C=C1CC[C@@H](O)[C@]2(C)CC[C@@H]([C@@H](C)C(=O)O)C[C@H]12
71 CNP0260962.2 1 1 2 CNP0260962.2 CNP0260962.2_out_converted.sdf CC1=CC(C)(C)C(C=O)=C[C@@H]1OC(=O)/C=C(\C)CO
72 CNP0273031.1 1 1 2 CNP0273031.1 CNP0273031.1_out_converted.sdf C[C@@H]1CC[C@]2(C)[C@H](CO)CCC[C@@H]2[C@@]1(C)CC/C(=C/CO)CO
73 CNP0146656.1 1 1 2 CNP0146656.1 CNP0146656.1_out_converted.sdf C[C@@H]1C(=O)O[C@H]2[C@H]1[C@@H](O)C[C@@]1(C)C(=O)C=C[C@](C)(O)[C@H]21
74 CPD-12022 1 1 2 CPD-12022 CPD-12022_out_converted.sdf COc1ccc(NC=O)c(C(=O)CCNC(C)=O)c1
75 CPD-11556 1 1 2 CPD-11556 CPD-11556_out_converted.sdf Cc1cc(O)cc2c1C(=O)CC(O)(Cc1cc([O-])cc(=O)o1)O2
76 CPD-11553 1 1 2 CPD-11553 CPD-11553_out_converted.sdf Cc1cc(O)cc(O)c1-c1cc([O-])cc(=O)o1
77 CPD-10176 1 1 2 CPD-10176 CPD-10176_out_converted.sdf O=C1c2cc([O-])cc(O)c2C(=O)c2c1cc1c(c2O)[C@@H]2C=CO[C@@H]2O1
78 CNP0136916.1 1 1 2 CNP0136916.1 CNP0136916.1_out_converted.sdf COC1=CC(C)=C(C(=O)O[C@@H]2C[C@]3(C)[C@H]4[C@@H](O)C(C)(C)C[C@@H]4C=C(CO)[C@]23O)C(O)=C1
79 CPD-19294 1 1 2 CPD-19294 CPD-19294_out_converted.sdf C1C(OC2=C(C1=O)C=CC(=C2)O)C3=CC(=C(C=C3)O)O
80 CNP0148342.3 1 1 2 CNP0148342.3 CNP0148342.3_out_converted.sdf COC1=CC=C2C(=C1)OC[C@@]1(O)C3=CC4=C(C=C3O[C@@H]21)OCO4
81 CNP0260370.1 1 1 2 CNP0260370.1 CNP0260370.1_out_converted.sdf CC1=C(C)C2=CC(=O)[C@](C)(O)[C@H](O)[C@@H]2CO1
82 CNP0232261.2 1 1 2 CNP0232261.2 CNP0232261.2_out_converted.sdf COC1=CC=C([C@H]2COC3=CC(O)=CC=C3C2)C(O)=C1O
83 C16405 1 1 2 C16405 C16405_out_converted.sdf COC1=C(C=CC(=C1)C=CC(=O)C2=C(C=C(C=C2O)O)O)O
84 DCPIP 1 1 2 DCPIP DCPIP_out_converted.sdf O=C1C(Cl)=CC(=Nc2ccc(O)cc2)C=C1Cl
85 C19588 1 1 2 C19588 C19588_out_converted.sdf COC1=C2C3=C(C(=O)CC3)C(=O)OC2=C4C5C(C(OC5OC4=C1)O)O
86 CPD-9557 1 1 2 CPD-9557 CPD-9557_out_converted.sdf Cc1cc(O)c2c(c1)Cc1cc(O)cc(O)c1C2=O
87 CNP0216219.17 1 1 2 CNP0216219.17 CNP0216219.17_out_converted.sdf CO[C@H]1C=C[C@@]23C4=CC5=C(C=C4[C@H](O)[N@@](C[C@@H]2O)[C@H]3C1)OCO5
88 CPD-8215 1 1 2 CPD-8215 CPD-8215_out_converted.sdf Cc1cc(O)c2c(c1)C(=O)c1cc([O-])cc(O)c1C2=O
89 CNP0243499.1 1 1 2 CNP0243499.1 CNP0243499.1_out_converted.sdf COC(=O)/C(C)=C/[C@@H](O)C1=CC2=CC=C(OC)C(O)=C2OC1=O
90 CPD-7027 1 1 2 CPD-7027 CPD-7027_out_converted.sdf COc1cc2c(=O)c(-c3ccc(O)cc3)coc2cc1O
91 CPD-6955 1 1 2 CPD-6955 CPD-6955_out_converted.sdf O=C(/C=C/c1ccc(O)cc1)Cc1cc([O-])cc(=O)o1
92 CPD-693 1 1 2 CPD-693 CPD-693_out_converted.sdf CC1=CC(=O)CC(C)(C)[C@@]1(O)/C=C/C(C)=C\C(=O)[O-]
93 CNP0248017.2 1 1 2 CNP0248017.2 CNP0248017.2_out_converted.sdf CCC(=O)NCC[C@@]1(O)C2=C(N=CS2)C2=NC=CC3=C4C=CC=CC4=NC1=C23
94 CNP0090058.1 1 1 2 CNP0090058.1 CNP0090058.1_out_converted.sdf CCOC(=O)C1=C(C=O)C2=C3CCC(=O)[C@H](C3)C3=CC4=C(C=C3CC[C@@H](CO)COC3=C2C(=C(CO)C2=C3C[C@@H]([C@](C)(O)CCCO)O2)O1)NC=C4
95 CPD-498 1 1 2 CPD-498 CPD-498_out_converted.sdf C=C1C[C@]23C[C@H]1CC[C@@H]2C1=CC(=O)C[C@@](C)(C(=O)[O-])[C@H]1[C@@H]3C(=O)[O-]
96 CNP0251052.1 1 1 2 CNP0251052.1 CNP0251052.1_out_converted.sdf C[C@@H](CO)[C@@H](C)C[C@@H](O)[C@](C)(O)[C@H]1CC[C@@]2(O)C3=CC(=O)[C@]4(O)C[C@@H](O)[C@@H](O)C[C@]4(C)[C@H]3CC[C@]12C
97 CNP0120483.1 1 1 2 CNP0120483.1 CNP0120483.1_out_converted.sdf C=C(C)C1=CC2=C3O[C@H]4C5=CC=C(O)C=C5OC[C@@]4(O)C3=CC=C2O1
98 CNP0122740.1 1 1 2 CNP0122740.1 CNP0122740.1_out_converted.sdf C=C1CC[C@@H]2[C@@](C)(CO)C[C@@H](O)C[C@]2(C)[C@@H]1CCC1=CC(=O)OC1
99 CNP0147378.1 1 1 2 CNP0147378.1 CNP0147378.1_out_converted.sdf CC(=O)OC[C@]12O[C@]34C=C(C)C(=O)C[C@]3(C)[C@@]1(C)[C@H](O)C[C@H]2O4
100 CNP0153223.1 1 1 2 CNP0153223.1 CNP0153223.1_out_converted.sdf CCC[C@H]1OCC2=C([C@@H](O)[C@@H]3O[C@@H]3C2=O)[C@@H]1O
101 CNP0210135.1 1 1 2 CNP0210135.1 CNP0210135.1_out_converted.sdf COC1=CC=C2C(=C1)O[C@@H]1C3=CC(OC)=C(O)C=C3OC[C@H]21
102 CNP0426805.1 1 1 2 CNP0426805.1 CNP0426805.1_out_converted.sdf CC[C@@H]1CN2CCC3=C(NC4=CC=CC=C34)C2=C/C1=C(\C=O)C(=O)OC
103 CNP0191964.1 1 1 2 CNP0191964.1 CNP0191964.1_out_converted.sdf C/C=C(\C)[C@H]1OC2=C(C(=O)NC=C2C2=CC=CC=C2)[C@]1(C)CO
104 CNP0199889.5 1 1 2 CNP0199889.5 CNP0199889.5_out_converted.sdf C=C1C(=O)O[C@@H]2CC(C)=C([C@@H](C)CCCOC(C)=O)[C@H](O)[C@@H]12
105 CNP0359684.0 1 1 2 CNP0359684.0 CNP0359684.0_out_converted.sdf COC1=CC2=C(OC3=CC=C(O)C=C3C2=O)C(OC)=C1OC
106 CNP0292276.1 1 1 2 CNP0292276.1 CNP0292276.1_out_converted.sdf C[C@H](C(=O)O)[C@H]1CCC2=CC(=O)C[C@H](C)[C@@]2(C)C1
107 CNP0353297.0 1 1 2 CNP0353297.0 CNP0353297.0_out_converted.sdf COC1=CC2=C(C=C1O)C(C=O)=C(C1=CC=C(O)C=C1O)O2
108 CNP0204494.2 1 1 2 CNP0204494.2 CNP0204494.2_out_converted.sdf COC1=CC(O)=CC2=C1C(=O)[C@@H](C1=CC=C3OC(C)(C)C=CC3=C1O)CO2
109 CNP0352117.1 1 1 2 CNP0352117.1 CNP0352117.1_out_converted.sdf CC[C@H](C)C[C@@]1(C)C=C/C(=C2/C(=O)O[C@H](CC(=O)O)C2=O)O1
110 CNP0204979.2 1 1 2 CNP0204979.2 CNP0204979.2_out_converted.sdf CC(C)=CCC1=C([C@H]2CC(=O)C3=C(O)C=C(O)C=C3O2)C=CC(O)=C1O
111 CNP0342898.3 1 1 2 CNP0342898.3 CNP0342898.3_out_converted.sdf COC1=CC(O)=CC([C@H](/C=C\C2=CC=C(O)C=C2)[C@@H](O)CO)=C1
112 CNP0329144.1 1 1 2 CNP0329144.1 CNP0329144.1_out_converted.sdf C=C(C)[C@@H](O)CC1=C(OC)C(=O)C2=CC(OC)=C(O)C(O)=C2C1=O
113 CNP0328219.1 1 1 2 CNP0328219.1 CNP0328219.1_out_converted.sdf C=C1[C@@H](O)C[C@H]2C(C)(C)C[C@H](Cl)C[C@@]2(C)[C@H]1C[C@H](O)[C@H]1CC(=O)NC1=O
114 CNP0320138.1 1 1 2 CNP0320138.1 CNP0320138.1_out_converted.sdf C=C(C)[C@@H](CC1=C2OC(C)(C)C=CC2=C(O)C2=C1OC1=CC=C(O)C=C1C2=O)OO
115 CNP0318279.1 1 1 2 CNP0318279.1 CNP0318279.1_out_converted.sdf COC(=O)N1C=C([C@](OC)(C2=CNC3=CC(Br)=CC=C23)S(=O)(=O)O)C2=CC=C(Br)C=C21
116 CNP0317733.1 1 1 2 CNP0317733.1 CNP0317733.1_out_converted.sdf C=C1C(=O)O[C@H](C2=COC=C2)C[C@@H]1[C@@]1(C)CC=C[C@@]2(C)COC(=O)[C@H]12
117 CNP0308290.1 1 1 2 CNP0308290.1 CNP0308290.1_out_converted.sdf C[C@@H]1CC(=O)[C@]23CO[C@@H](O)[C@@H]2CC(=O)C[C@@H]3[C@@]12C[C@@H](C1=COC=C1)OC2=O
118 CNP0307229.6 1 1 2 CNP0307229.6 CNP0307229.6_out_converted.sdf O=C(OC1C[C@H]2CC[C@@H](C1)N2)[C@@H](CO)C1=CC=CC=C1
119 CNP0302706.2 1 1 2 CNP0302706.2 CNP0302706.2_out_converted.sdf C[C@H]1C(=O)C2=C(O)C=C(O)C=C2O[C@H]1C1=CC=C(O)C=C1
120 CNP0363021.1 1 1 2 CNP0363021.1 CNP0363021.1_out_converted.sdf O[C@@]12COC3=CC4=C(C=CO4)C=C3[C@@H]1OC1=CC3=C(C=C12)OCO3
121 CNP0365331.1 1 1 2 CNP0365331.1 CNP0365331.1_out_converted.sdf O=C1C[C@@H](C2=CC=C3OCC=CC3=C2)OC2=CC(O)=CC=C12
122 CNP0399207.2 1 1 2 CNP0399207.2 CNP0399207.2_out_converted.sdf O=C1C[C@H]2O[C@H]([C@H](O)C3=CC=CC=C3)[C@@H](O)[C@@H]2O1
123 CNP0425372.1 1 1 2 CNP0425372.1 CNP0425372.1_out_converted.sdf CC[C@H](C)[C@H](C1=C(O)C(C)(C)C(=O)C(C)(C)C1=O)C1=C(O)C(C)=C(O)C(C(=O)CCC2=CC=CC=C2)=C1O
124 CNP0426639.0 1 1 2 CNP0426639.0 CNP0426639.0_out_converted.sdf O=C1C2=C(COC3=CC=CC(O)=C32)OC2=CC3=C(OCO3)C(O)=C12
125 CNP0165737.1 1 1 2 CNP0165737.1 CNP0165737.1_out_converted.sdf C/C=C(\C=C(C)\C=C\CC/C=C(\C)C(=O)[C@]12O[C@H]1[C@@](C)(O)NC2=O)C(=O)OC
126 CNP0426155.1 1 1 2 CNP0426155.1 CNP0426155.1_out_converted.sdf CC[C@H](C)[C@@H]1NCC[C@]12C(=O)NC1=CC(O)=CC=C12
127 CNP0276575.3 1 1 2 CNP0276575.3 CNP0276575.3_out_converted.sdf O=C1C=CC[C@@H](/C=C/C[C@H](O)C[C@@H](O)/C=C/C2=CC=CC=C2)O1
128 CNP0286670.1 1 1 2 CNP0286670.1 CNP0286670.1_out_converted.sdf C=CC(C)(C)C1=CC([C@H]2COC3=CC(O)=CC=C3C2)=C(O)C(O)=C1OC
129 CNP0228022.1 1 1 2 CNP0228022.1 CNP0228022.1_out_converted.sdf CC1=CC=C(C(=O)O[C@@H]2C=C3CCN(C)[C@H]3[C@H](C3=CC4=C(C=C3C(=O)O)OCO4)C2)C(C)=N1
130 CNP0425515.0 1 1 2 CNP0425515.0 CNP0425515.0_out_converted.sdf COC(=O)C1=C(C2=CNC3=CC(Cl)=C(Cl)C=C23)C(C2=CNC3=CC=C(Cl)C=C23)=CN1
131 CNP0425203.1 1 1 2 CNP0425203.1 CNP0425203.1_out_converted.sdf CC1(C)CCCC(=O)[C@H]1CCC1=C[C@@H](O)CC1=O
132 CNP0212403.1 1 1 2 CNP0212403.1 CNP0212403.1_out_converted.sdf COC1=C([C@]2(O)COC3=CC(O)=CC(O)=C3C2=O)C=CC(O)=C1CC=C(C)C
133 CNP0424844.1 1 1 2 CNP0424844.1 CNP0424844.1_out_converted.sdf COC1=CC2=C(C(O)=C1C1=CC(=O)CC1)[C@@H]1C=CO[C@@H]1O2
134 CNP0424373.0 1 1 2 CNP0424373.0 CNP0424373.0_out_converted.sdf CN1C2=CC3=C(C=C2C(=O)C2=C1OC=C2)OCO3
135 CNP0424274.0 1 1 2 CNP0424274.0 CNP0424274.0_out_converted.sdf COC1=CC(O)=C(C2=COC3=CC(O)=CC(O)=C3C2=O)C=C1CC=C(C)C
136 CNP0424223.0 1 1 2 CNP0424223.0 CNP0424223.0_out_converted.sdf OC1=CC=C2C(C3=C(O)C=CC4=CC(O)=CC=C34)=C(O)C=CC2=C1
137 CNP0412522.2 1 1 2 CNP0412522.2 CNP0412522.2_out_converted.sdf COC1=CC2=C(C=C1[C@H]1COC3=CC(O)=CC=C3C1)OCO2
138 CNP0406736.1 1 1 2 CNP0406736.1 CNP0406736.1_out_converted.sdf C[C@H](O)/C=C1\C[C@H](O)[C@@]23C[C@@H]2C(C)(C)O[C@@]3(O)C1=O
139 CNP0179144.1 1 1 2 CNP0179144.1 CNP0179144.1_out_converted.sdf COC1=CC2=C(CN3C[C@@H](O)[C@]24CC[C@H](OC)C[C@@H]34)C(OC)=C1O
140 CNP0226086.2 1 1 2 CNP0226086.2 CNP0226086.2_out_converted.sdf C=C1C(=O)O[C@@H](/C=C(\C)CO)[C@H]1[C@@H](C/C(C)=C/CO)OC(=O)/C(C)=C\C

View File

@@ -0,0 +1,185 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
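Select a diverse Top-30 subset of molecules with ChemPlot and highlight it.

Compatible with the ChemPlot API used here:
from_smiles(sim_type=...), .umap(), .cluster(), .interactive_plot().

Workflow:
1) Raw data -> UMAP + KMeans -> pick Top-30 -> export selected_top30.csv,
   plus the interactive plot chemplot_interactive_original.html
   (colored by the original Total_Count).
2) Set Total_Count=100 for the Top-30 -> interactive plot
   chemplot_interactive_marked.html (selected molecules highlighted).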
"""
import argparse
from pathlib import Path
import numpy as np
import pandas as pd
from chemplot import Plotter
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# --- 小工具 ---
def guess_smiles_col(df: pd.DataFrame):
    """Return the label of the column most likely to hold SMILES strings.

    A column whose header is ``smiles``, ``smile`` or ``canonical_smiles``
    (case-insensitive, surrounding whitespace ignored) wins outright.
    Otherwise every column is scored by the fraction of its first 50 non-null
    values that look like SMILES, and the best-scoring column is returned.

    Args:
        df: Table to inspect.

    Returns:
        The matching column label, or ``None`` when no header matches and no
        column contains a single SMILES-looking value.
    """
    import re

    known_names = ("smiles", "smile", "canonical_smiles")
    for col in df.columns:
        # Column labels are not guaranteed to be strings (a header-less CSV
        # yields integers), so normalise before comparing.
        if str(col).strip().lower() in known_names:
            return col

    # Heuristic fallback: a value "looks like SMILES" when it consists only of
    # the characters below and contains at least one common atom symbol or
    # bracket/parenthesis.
    smiles_like = re.compile(r"^[A-Za-z0-9@+\-\[\]\(\)=#\\/]+$")
    best_col, best_ratio = None, 0.0
    for col in df.columns:
        values = df[col].dropna().astype(str).head(50).tolist()
        if not values:
            continue
        hits = [
            bool(smiles_like.match(v)) and any(ch in v for ch in "CNOH[]()")
            for v in values
        ]
        ratio = float(np.mean(hits))
        # Starting from 0.0 with a strict ">" means a column with no hits is
        # never chosen, and the earliest column wins ties.
        if ratio > best_ratio:
            best_col, best_ratio = col, ratio
    return best_col
def farthest_point_sampling(X: np.ndarray, k: int, seed: int = 42, init: int | None = None):
n = X.shape[0]
if n == 0: return []
if init is None:
init = int(np.argmax(np.linalg.norm(X - X.mean(axis=0), axis=1)))
sel = [init]
dmin = np.linalg.norm(X - X[init], axis=1)
for _ in range(1, min(k, n)):
j = int(np.argmax(dmin))
sel.append(j)
dmin = np.minimum(dmin, np.linalg.norm(X - X[j], axis=1))
return sel
def kmeans_then_diverse(coords: np.ndarray, n_clusters: int, topk: int, seed: int = 42):
    """Cluster ``coords`` with KMeans and choose ``topk`` diverse representatives.

    One representative is taken per cluster: the member closest to the cluster
    centre. If that yields more than ``topk`` points, they are thinned with
    farthest-point sampling; if it yields fewer, the representatives are kept
    and the set is filled up with the points farthest from the selection.

    Args:
        coords: Array of points, one per row.
        n_clusters: Requested number of KMeans clusters; capped at the number
            of rows because KMeans cannot fit more clusters than samples.
        topk: Number of row indices to return; capped at the number of rows.
        seed: ``random_state`` for KMeans.

    Returns:
        ``(labels, selected)`` where ``labels`` holds the cluster label of
        every row and ``selected`` is a list of distinct row indices.
    """
    n_samples = coords.shape[0]
    if n_samples == 0:
        return np.empty(0, dtype=int), []

    n_clusters = min(n_clusters, n_samples)
    km = KMeans(n_clusters=n_clusters, random_state=seed, n_init="auto")
    labels = km.fit_predict(coords)

    # Representative of each cluster = member nearest to its centre.
    picks = []
    for c in range(n_clusters):
        members = np.flatnonzero(labels == c)
        if members.size == 0:
            continue
        dist = np.linalg.norm(coords[members] - km.cluster_centers_[c], axis=1)
        picks.append(int(members[np.argmin(dist)]))

    if topk <= 0:
        return labels, []
    if len(picks) > topk:
        # Too many representatives: keep the most spread-out subset.
        order = farthest_point_sampling(coords[picks], topk, seed=seed)
        return labels, [picks[i] for i in order]
    if len(picks) < topk:
        # Too few: keep them and top up instead of discarding them.
        picks = _extend_with_farthest(coords, picks, topk)
    return labels, picks


def _extend_with_farthest(coords: np.ndarray, picks: list, topk: int):
    """Append the rows farthest from ``picks`` until ``topk`` rows are chosen."""
    chosen = list(picks)
    dmin = np.full(coords.shape[0], np.inf)
    for j in chosen:
        dmin = np.minimum(dmin, np.linalg.norm(coords - coords[j], axis=1))
    # -inf marks rows that are already chosen so they are never picked again.
    dmin[chosen] = -np.inf
    limit = min(topk, coords.shape[0])
    while len(chosen) < limit:
        j = int(np.argmax(dmin))
        chosen.append(j)
        dmin = np.minimum(dmin, np.linalg.norm(coords - coords[j], axis=1))
        dmin[j] = -np.inf
    return chosen
def static_preview(coords: np.ndarray, selected: list[int], out_png: Path, title: str):
    """Save a static scatter plot of the embedding with the selection marked.

    Args:
        coords: 2-D array; columns 0 and 1 are plotted as x and y.
        selected: Row indices into ``coords`` to overlay with ``x`` markers;
            may be empty or ``None``.
        out_png: Destination image path.
        title: Figure title.
    """
    import matplotlib.pyplot as plt

    # An explicit Figure/Axes pair avoids relying on pyplot's "current figure",
    # and the finally-clause releases the figure even if saving fails.
    fig, ax = plt.subplots(figsize=(7, 6))
    try:
        ax.scatter(coords[:, 0], coords[:, 1], s=10, alpha=0.5)
        # len() instead of truthiness so a NumPy index array is accepted too.
        if selected is not None and len(selected) > 0:
            sel = np.asarray(selected)
            ax.scatter(coords[sel, 0], coords[sel, 1], s=40, marker="x")
        ax.set_title(title)
        ax.set_xlabel("Dim-1")
        ax.set_ylabel("Dim-2")
        fig.tight_layout()
        fig.savefig(out_png, dpi=200)
    finally:
        plt.close(fig)
# --- 主流程 ---
def main():
    """Run the two-pass ChemPlot pipeline from the command line.

    Pass 1 embeds the molecules with UMAP, selects a diverse subset via
    KMeans, and writes ``selected_top30.csv``, ``embedding_with_labels.csv``,
    ``chemplot_interactive_original.html`` and ``scatter_preview.png``.
    Pass 2 sets the target of the selected molecules to 100 and writes
    ``chemplot_interactive_marked.html`` so that they stand out.

    Raises:
        RuntimeError: If the SMILES column cannot be determined, or if the
            embedding does not contain one point per input row.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--csv", default="data/title_intersection_analysis.csv")
    ap.add_argument("--outdir", default="chemplot_output")
    ap.add_argument("--smiles-col", default=None,
                    help="name of the SMILES column (auto-detected when omitted)")
    ap.add_argument("--n-clusters", type=int, default=30)
    ap.add_argument("--topk", type=int, default=30)
    ap.add_argument("--neighbors", type=int, default=None)   # forwarded to umap()
    ap.add_argument("--min-dist", type=float, default=None)  # forwarded to umap()
    ap.add_argument("--seed", type=int, default=42)
    args = ap.parse_args()

    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(args.csv)
    if args.smiles_col is not None:
        if args.smiles_col not in df.columns:
            raise RuntimeError(f"Column {args.smiles_col!r} not found in {args.csv}")
        smiles_col = args.smiles_col
    else:
        smiles_col = guess_smiles_col(df)
    # "is None" rather than truthiness: a valid column label may be falsy (e.g. 0).
    if smiles_col is None:
        raise RuntimeError("无法识别 SMILES 列;请用 --smiles-col 指定。")

    # Rows without a SMILES string cannot be embedded. Drop them up front and
    # renumber so positional indices and row labels coincide from here on.
    df = df.dropna(subset=[smiles_col]).reset_index(drop=True)
    smiles = df[smiles_col].tolist()

    # ------- Pass 1: raw data -> UMAP -> KMeans -> Top-K -------
    # The original Total_Count is the (continuous) colouring target.
    target_orig = df["Total_Count"].tolist() if "Total_Count" in df.columns else [0] * len(df)
    plotter, coords = _embed_smiles(smiles, target_orig, args)

    # NOTE(review): ChemPlot appears to discard SMILES it cannot parse, which
    # would silently misalign coords with df; fail loudly instead.
    if coords.shape[0] != len(df):
        raise RuntimeError(
            f"Embedding has {coords.shape[0]} points but the input has {len(df)} rows; "
            f"remove invalid SMILES from column {smiles_col!r} and retry."
        )

    labels, selected_idx = kmeans_then_diverse(
        coords, n_clusters=args.n_clusters, topk=args.topk, seed=args.seed
    )

    # Selected rows, with every column of the input table.
    df.iloc[selected_idx].copy().to_csv(outdir / "selected_top30.csv", index=False)

    # Full table plus embedding coordinates, cluster label and selection flag.
    selected_flags = np.zeros(len(df), dtype=int)
    selected_flags[np.asarray(selected_idx, dtype=int)] = 1  # positional, like iloc above
    df_emb = df.copy()
    df_emb["x"] = coords[:, 0]
    df_emb["y"] = coords[:, 1]
    df_emb["cluster"] = labels
    df_emb["Selected"] = selected_flags
    df_emb.to_csv(outdir / "embedding_with_labels.csv", index=False)

    _write_interactive(
        plotter, args,
        outdir / "chemplot_interactive_original.html",
        "UMAP + KMeans (original Total_Count)",
    )
    static_preview(coords, selected_idx, outdir / "scatter_preview.png", "UMAP + KMeans + Top-30")

    # ------- Pass 2: set the target of the selected rows to 100 -------
    target_marked = list(target_orig)
    for i in selected_idx:
        target_marked[i] = 100
    # UMAP is re-run with the same random_state so the layout matches pass 1.
    plotter2, _ = _embed_smiles(smiles, target_marked, args)
    _write_interactive(
        plotter2, args,
        outdir / "chemplot_interactive_marked.html",
        "UMAP + KMeans (Top-30 set Total_Count=100)",
    )

    print("Done. Outputs:", outdir.resolve())


def _embed_smiles(smiles, target, args):
    """Build a ChemPlot plotter, run UMAP, and return ``(plotter, coords)``."""
    plotter = Plotter.from_smiles(
        smiles,
        target=target,
        target_type="R",
        sim_type="structural",
    )
    emb = plotter.umap(
        n_neighbors=args.neighbors,
        min_dist=args.min_dist,
        pca=False,
        random_state=args.seed,
    )
    # Copy the first two columns so later plotter calls cannot alter them.
    return plotter, emb.iloc[:, :2].to_numpy().copy()


def _write_interactive(plotter, args, html_path, title):
    """Cluster the embedded plotter and write its interactive HTML plot."""
    plotter.cluster(n_clusters=args.n_clusters, random_state=args.seed)
    plotter.interactive_plot(
        size=900,
        kind="scatter",
        remove_outliers=False,
        is_colored=True,   # colour by target (Total_Count)
        clusters=True,     # include the clusters tab
        filename=str(html_path),
        show_plot=False,
        title=title,
    )
if __name__ == "__main__":
    # Run the pipeline only when executed as a script, not when imported.
    main()