From 92850a0936a8ffa17b08864308e76bae47214216 Mon Sep 17 00:00:00 2001 From: mm644706215 Date: Tue, 16 Sep 2025 20:49:33 +0800 Subject: [PATCH] frist add --- README.md | 40 ++++++ data/title_intersection_analysis.csv | 140 ++++++++++++++++++++ src/chemplot_select_and_mark.py | 185 +++++++++++++++++++++++++++ 3 files changed, 365 insertions(+) create mode 100644 README.md create mode 100644 data/title_intersection_analysis.csv create mode 100644 src/chemplot_select_and_mark.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..f904158 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +## 环境安装 + +```bash +conda create -n chemplot_env -y -c conda-forge chemplot +conda activate chemplot_env +# 失败可以用pip 进行安装尝试 与mordred兼容 +pip install "networkx==2.8.8" chemplot pandas matplotlib scikit-learn umap-learn rdkit-pypi +``` + +## t-SNE 的局限性 + +假设有三个分子系列 A、B、C,实际空间距离 A↔B 比 A↔C 近: + +t-SNE 可能只把 A、B 内部的点聚在一起,但 A↔B vs A↔C 的距离无法反映真实远近关系。 + +这意味着如果用 t-SNE 后结果做 KMeans,可能出现不合理的簇划分。 + +## UMAP 的优势 + +在 UMAP 中,A、B、C 的相对距离更接近原始分子指纹空间。 + +当你在降维后的空间运行 KMeans 或 HDBSCAN 时,聚类结果与真实分子结构差异更一致。 + +t-SNE 的时间复杂度接近 O(N^2),数据量增加后计算时间会爆炸式增长。 +UMAP 在大规模分子库上表现更好,并且支持并行计算。 + +UMAP 是更合适的选择,因为: + +它更好地保留了化学空间的全局结构,确保聚类结果有化学意义。 + +稳定、可复现,适合后续湿实验追踪和比较。 + +计算速度更快,支持未来大规模分子库扩展。 + +在生成交互式 HTML 时,结果更直观、可解释。 + +## 功能脚本 + +### chemplot_select_and_mark.py 介绍 + diff --git a/data/title_intersection_analysis.csv b/data/title_intersection_analysis.csv new file mode 100644 index 0000000..fc41c4c --- /dev/null +++ b/data/title_intersection_analysis.csv @@ -0,0 +1,140 @@ +Cleaned_Title,Karma_Count,Vina_Count,Total_Count,Karma_Original_Titles,Vina_Original_Titles,smiles +LICODIONE,4,9,13,LICODIONE,LICODIONE_out_converted.sdf,O=C(/C=C(\[O-])c1ccc(O)cc1)c1ccc(O)cc1O +CPD-591,4,7,11,CPD-591,CPD-591_out_converted.sdf,[O-]c1cc2c([O-])cc(O)cc2[o+]c1-c1ccc(O)c(O)c1 +CPD-17791,4,5,9,CPD-17791,CPD-17791_out_converted.sdf,CC1=CC(=CC(=C1C(=O)C2=C(C=C(C=C2O)O)O)O)O +CPD-6366,3,4,7,CPD-6366,CPD-6366_out_converted.sdf,COc1ccc2c(c1)OC(O)C(=O)N2[O-] +CNP0359851.1,3,3,6,CNP0359851.1,CNP0359851.1_out_converted.sdf,CC1(C)OC2=C(C[C@H]1O)C(O)=CC(O)=C2C(=O)/C=C/C1=CC=CC=C1 +CNP0360022.0,3,3,6,CNP0360022.0,CNP0360022.0_out_converted.sdf,COC1=CC(OCC=C(C)C)=C(OC)C(C2=COC3=CC(O)=CC(O)=C3C2=O)=C1 +CNP0288448.3,1,5,6,CNP0288448.3,CNP0288448.3_out_converted.sdf,O=C1C=C(/C=C/C2=CC=C(O)C=C2)[C@@H]2C(=O)[C@H]1[C@@H](C1=CC=C(O)C=C1)[C@@H]2C1=CC(O)=CC(O)=C1 +CNP0574550.1,3,3,6,CNP0574550.1,CNP0574550.1_out_converted.sdf,C=C1[O+]=C(OC2=C([O-])CC(OC3=C([O-])CC(/[O+]=C4\CC[C@@]5(C)[C@H](CC[C@@H]6[C@@H]5CC(=O)[C@]5(C)C(C7=C=C([O-])OC7)CC[C@]65[O-])C4)=[O+]C3=C)=[O+]C2=C)CC(=O)C1=O +CNP0076684.0,3,3,6,CNP0076684.0,CNP0076684.0_out_converted.sdf,COC1=C(O)C=CC=C1C1=C(O)C(=O)C2=CC(O)=CC(O)=C2O1 +CNP0124299.0,3,3,6,CNP0124299.0,CNP0124299.0_out_converted.sdf,O=C(C1=CN=C2C=CC=CC2=C1)C1=NC(=O)C2=CC=CC=C2N1 +CNP0105988.1,3,3,6,CNP0105988.1,CNP0105988.1_out_converted.sdf,COC1=CC(OC)=C([C@H]2COC3=CC(O)=CC(O)=C3C2=O)C=C1O +CNP0256385.0,1,4,5,CNP0256385.0,CNP0256385.0_out_converted.sdf,O=C1C=C(C2=CC=C(O)C=C2)OC2C=C(O)CC(O)=C12 +BUTEIN,2,3,5,BUTEIN,BUTEIN_out_converted.sdf,O=C(/C=C/c1ccc(O)c(O)c1)c1ccc(O)cc1O +CPD-12949,2,2,4,CPD-12949,CPD-12949_out_converted.sdf,C[C@]1(O)CC(=O)[C@@H]2c3cc4cccc(O)c4c(O)c3C(=O)C[C@]2(O)C1 +CNP0404986.0,2,2,4,CNP0404986.0,CNP0404986.0_out_converted.sdf,O=C(C1=CC=C2OCOC2=C1O)C1=NC=CC2=CC3=C(C=C12)OCO3 +CNP0426012.0,2,2,4,CNP0426012.0,CNP0426012.0_out_converted.sdf,COC1=CC(O)=C(O)C=C1C1=COC2=CC(O)=CC=C2C1=O +CNP0426334.0,2,2,4,CNP0426334.0,CNP0426334.0_out_converted.sdf,COC1=C(O)C=C2C(=O)C3=CC=C(O)C=C3OC2=C1OC +CNP0497639.1,3,1,4,CNP0497639.1,CNP0497639.1_out_converted.sdf,C[S@+]([O-])C1=C(CN2C3=CC=CC=C3C3=CC=NC=C32)NC=N1 +CNP0348231.1,2,2,4,CNP0348231.1,CNP0348231.1_out_converted.sdf,C=C(C)[C@@H](O)CC/C(C)=C/CC1=C(OC)C(O)=CC2=C1C(=O)C1=C(O)C=C(O)C=C1O2 +CNP0524070.0,2,2,4,CNP0524070.0,CNP0524070.0_out_converted.sdf,O=C1CC(=C2C=CC(O)C=C2)OC2=CC(O)=CC(O)=C12 +CNP0301960.1,2,2,4,CNP0301960.1,CNP0301960.1_out_converted.sdf,O=C1C2=CC(O)=CC=C2N=C2[C@@H](O)CCN21 +CPD-12724,2,2,4,CPD-12724,CPD-12724_out_converted.sdf,O=c1cc(-c2ccccc2)oc2cc(O)c(O)c(O)c12 +CPD-12726,2,2,4,CPD-12726,CPD-12726_out_converted.sdf,O=c1cc(-c2ccc(O)cc2)oc2cc(O)c(O)c(O)c12 +574-TRIHYDROXY-3-METHOXYFLAVONE,2,2,4,574-TRIHYDROXY-3-METHOXYFLAVONE,574-TRIHYDROXY-3-METHOXYFLAVONE_out_converted.sdf,COc1cc(-c2cc(=O)c3c(O)cc([O-])cc3o2)ccc1O +CPD-15721,2,2,4,CPD-15721,CPD-15721_out_converted.sdf,CCC1(CC(=O)C2=C(C3=C(C=C2C1C(=O)OC)C(=O)C4=C(C3=O)C(=CC=C4)O)O)O +CPD-17024,2,2,4,CPD-17024,CPD-17024_out_converted.sdf,C1=CC(=C(C(=C1)O)C(=O)C2=CC(=C(N2)Cl)Cl)O +CPD-17457,2,2,4,CPD-17457,CPD-17457_out_converted.sdf,C1C2C(C3(C=CC(=O)C=C3O1)O)OC4=CC5=C(C=C24)OCO5 +CNP0234857.5,2,2,4,CNP0234857.5,CNP0234857.5_out_converted.sdf,CC(=O)O[C@@H]1C=C[C@@]23C4=CC5=C(C=C4CN(C[C@H]2O)[C@H]3C1)OCO5 +CPD-19273,1,3,4,CPD-19273,CPD-19273_out_converted.sdf,CC1=C2C=CC=C(C2=C(C3=C1C=C4C=C(C(=C(C4=C3O)[O-])C(=O)N)O)O)O +CPD-21169,2,2,4,CPD-21169,CPD-21169_out_converted.sdf,C1=CC=C(C=C1)C2=NC(=CC(=N2)Cl)SCC(C(=O)[O-])[NH3+] +CPD-3622,2,2,4,CPD-3622,CPD-3622_out_converted.sdf,COc1ccc(-c2coc3cc([O-])cc(O)c3c2=O)cc1O +CPD-3629,2,2,4,CPD-3629,CPD-3629_out_converted.sdf,O=c1c(-c2cc3c(cc2O)OCO3)coc2cc(O)ccc12 +CPD-3630,2,2,4,CPD-3630,CPD-3630_out_converted.sdf,O=C1c2ccc(O)cc2OC[C@H]1c1cc2c(cc1O)OCO2 +CPD-431,2,2,4,CPD-431,CPD-431_out_converted.sdf,O=c1cc(-c2ccc(O)cc2)oc2cc([O-])cc(O)c12 +CPD-7096,2,2,4,CPD-7096,CPD-7096_out_converted.sdf,[O-]c1cc(O)cc2[o+]c(-c3ccc(O)cc3)ccc12 +CPD-9539,2,2,4,CPD-9539,CPD-9539_out_converted.sdf,COc1ccc(-c2coc3cc(O)ccc3c2=O)cc1O +CPD1F-90,1,3,4,CPD1F-90,CPD1F-90_out_converted.sdf,O=c1c(O)c(-c2ccc(O)cc2)oc2cc([O-])cc(O)c12 +PHOTINUS-LUCIFERIN,2,2,4,PHOTINUS-LUCIFERIN,PHOTINUS-LUCIFERIN_out_converted.sdf,O=C([O-])[C@H]1CSC(c2nc3ccc(O)cc3s2)=N1 +CPD-17787,2,2,4,CPD-17787,CPD-17787_out_converted.sdf,CC1=CC(=CC(=C1C(=O)C2=C(C=C(C=C2O)OC)O)OC)O +CNP0352637.2,2,2,4,CNP0352637.2,CNP0352637.2_out_converted.sdf,COC1=CC(O)=CC([C@H]2CC(=O)C3=C(O)C=C(O)C=C3O2)=C1 +VESTITONE,2,2,4,VESTITONE,VESTITONE_out_converted.sdf,COc1ccc([C@@H]2COc3cc(O)ccc3C2=O)c(O)c1 +C12135,2,2,4,C12135,C12135_out_converted.sdf,COC1=CC(=C(C=C1)C2=COC3=CC(=CC(=C3C2=O)O)O)O +CNP0076169.0,2,2,4,CNP0076169.0,CNP0076169.0_out_converted.sdf,CC1=C(O)C=C(O)C2=C1OC(C1=CC=C(O)C=C1)=CC2=O +CNP0078636.1,2,2,4,CNP0078636.1,CNP0078636.1_out_converted.sdf,COC1=C(O)C(O)=C2C(=O)[C@H](CC3=CC=C(O)C=C3)COC2=C1OC +CNP0080149.0,2,2,4,CNP0080149.0,CNP0080149.0_out_converted.sdf,COC1=C(O)C=C2C(=O)C3=C(CCCC3)C(=O)C2=C1O +CNP0106634.1,2,2,4,CNP0106634.1,CNP0106634.1_out_converted.sdf,CC1(C)C=CC2=C(O)C=CC([C@H]3CC(=O)C4=C(O)C=C(O)C=C4O3)=C2O1 +CNP0106817.1,2,2,4,CNP0106817.1,CNP0106817.1_out_converted.sdf,COC1=CC=C([C@@H](C)C(=O)C2=CC=C(OC)C=C2O)C=C1 +CNP0126704.1,2,2,4,CNP0126704.1,CNP0126704.1_out_converted.sdf,COC1=CC(C2=COC3=CC(O)=CC(O)=C3C2=O)=CC2=C1O[C@@H](C(C)(C)O)C2 +CNP0143640.1,2,2,4,CNP0143640.1,CNP0143640.1_out_converted.sdf,COC(=O)[C@H]1COC(C2=CC=CC=C2O)=N1 +CNP0145895.1,2,2,4,CNP0145895.1,CNP0145895.1_out_converted.sdf,COC1=C(C)C(O)=C(C(C)=O)C2=C1[C@]1(C)C(=O)C(C(C)=O)=C(O)C[C@]1(OC)O2 +CNP0172162.2,2,2,4,CNP0172162.2,CNP0172162.2_out_converted.sdf,COC1=C(C)C(=O)C2=C(C=C(O)C3=C2O[C@H](C)[C@]3(C)[C@H](O)C/C=C(/C)CO)C1=O +CNP0354771.1,1,2,3,CNP0354771.1,CNP0354771.1_out_converted.sdf,C[C@H](CC(=O)C[C@H](C)[C@@H]1C[C@H](O)[C@]2(C)C3=C(C(=O)C[C@@]12C)[C@]1(C)CCC(=O)C(C)(C)[C@@H]1CC3=O)C(=O)O +CNP0426121.1,1,2,3,CNP0426121.1,CNP0426121.1_out_converted.sdf,COC1=CC=C([C@@H]2CC(=O)C3=C(O)C=C(O)C(CC(O)=C(C)C)=C3O2)C=C1 +CNP0426105.1,1,2,3,CNP0426105.1,CNP0426105.1_out_converted.sdf,COC1=CC=C([C@@H]2CCC3=C(O)C=C(O)C=C3O2)C=C1 +CNP0401138.0,1,2,3,CNP0401138.0,CNP0401138.0_out_converted.sdf,O=S(=O)(O)C1=CC=C2C(=C1)OC1=CC=CC=C12 +CNP0425564.0,1,2,3,CNP0425564.0,CNP0425564.0_out_converted.sdf,COC(=O)C1=CC=C(OC)C(OC2=CC=C(C(=O)O)C=C2)=C1 +CNP0213003.1,1,2,3,CNP0213003.1,CNP0213003.1_out_converted.sdf,CN1C(=N)N(C)[C@@H](CC2=CNC3=CC=CC=C23)C1=O +CNP0279015.1,1,2,3,CNP0279015.1,CNP0279015.1_out_converted.sdf,C[C@@H]1CC[C@H]2C(C)(C)CCC[C@]2(C)/C1=C/C1=CC(O)=C(O)C(/C=N/CCS(=O)(=O)O)=C1OS(=O)(=O)O +CNP0300571.1,1,1,2,CNP0300571.1,CNP0300571.1_out_converted.sdf,CO[C@@H]1C[C@@H](C2=CC=C(O)C=C2)OC2=CC(O)=CC=C21 +CPD-15172,1,1,2,CPD-15172,CPD-15172_out_converted.sdf,C1=CC=C(C=C1)C2=CC(=O)C3=C(O2)C=C(C(=O)C3=O)O +CNP0138834.1,1,1,2,CNP0138834.1,CNP0138834.1_out_converted.sdf,CC(C)=CCC1=CC([C@@H]2COC3=CC(O)=CC(O)=C3C2=O)=CC(O)=C1O +CNP0139950.1,1,1,2,CNP0139950.1,CNP0139950.1_out_converted.sdf,C=C(C)[C@@H](O)CC1=CC(C2=CC(=O)C3=C(O)C=C(O)C=C3O2)=CC2=C1OC(C)(C)C=C2 +CPD-17072,1,1,2,CPD-17072,CPD-17072_out_converted.sdf,CC(=O)C1=C(C(NC1=O)CC2=CNC3=CC=CC=C32)[O-] +CNP0142417.1,1,1,2,CNP0142417.1,CNP0142417.1_out_converted.sdf,C=C(C)[C@@H]1CC2=C(O)C=CC(C3=COC4=CC(O)=CC(O)=C4C3=O)=C2O1 +CPD-15777,1,1,2,CPD-15777,CPD-15777_out_converted.sdf,CC1C(=CC=CN1CC2=CN=C(N=C2N)C)CCO +CNP0142712.2,1,1,2,CNP0142712.2,CNP0142712.2_out_converted.sdf,COC1=CC2=C(C=C1OC)[C@@H](C(=O)C1=CC=C3OC=CC3=C1O)CCO2 +CPD-15265,1,1,2,CPD-15265,CPD-15265_out_converted.sdf,C1CC2C(=O)NC(C(=O)N2C1)CC3=CNC4=CC=CC=C43 +CPD-14164,1,1,2,CPD-14164,CPD-14164_out_converted.sdf,C1=CC(=C(C(=C1)O)O)C2=C(C(=CC=C2)O)O +CNP0135275.1,1,1,2,CNP0135275.1,CNP0135275.1_out_converted.sdf,C=C1CC[C@@H](O)[C@]2(C)CC[C@@H]([C@@H](C)C(=O)O)C[C@H]12 +CNP0260962.2,1,1,2,CNP0260962.2,CNP0260962.2_out_converted.sdf,CC1=CC(C)(C)C(C=O)=C[C@@H]1OC(=O)/C=C(\C)CO +CNP0273031.1,1,1,2,CNP0273031.1,CNP0273031.1_out_converted.sdf,C[C@@H]1CC[C@]2(C)[C@H](CO)CCC[C@@H]2[C@@]1(C)CC/C(=C/CO)CO +CNP0146656.1,1,1,2,CNP0146656.1,CNP0146656.1_out_converted.sdf,C[C@@H]1C(=O)O[C@H]2[C@H]1[C@@H](O)C[C@@]1(C)C(=O)C=C[C@](C)(O)[C@H]21 +CPD-12022,1,1,2,CPD-12022,CPD-12022_out_converted.sdf,COc1ccc(NC=O)c(C(=O)CCNC(C)=O)c1 +CPD-11556,1,1,2,CPD-11556,CPD-11556_out_converted.sdf,Cc1cc(O)cc2c1C(=O)CC(O)(Cc1cc([O-])cc(=O)o1)O2 +CPD-11553,1,1,2,CPD-11553,CPD-11553_out_converted.sdf,Cc1cc(O)cc(O)c1-c1cc([O-])cc(=O)o1 +CPD-10176,1,1,2,CPD-10176,CPD-10176_out_converted.sdf,O=C1c2cc([O-])cc(O)c2C(=O)c2c1cc1c(c2O)[C@@H]2C=CO[C@@H]2O1 +CNP0136916.1,1,1,2,CNP0136916.1,CNP0136916.1_out_converted.sdf,COC1=CC(C)=C(C(=O)O[C@@H]2C[C@]3(C)[C@H]4[C@@H](O)C(C)(C)C[C@@H]4C=C(CO)[C@]23O)C(O)=C1 +CPD-19294,1,1,2,CPD-19294,CPD-19294_out_converted.sdf,C1C(OC2=C(C1=O)C=CC(=C2)O)C3=CC(=C(C=C3)O)O +CNP0148342.3,1,1,2,CNP0148342.3,CNP0148342.3_out_converted.sdf,COC1=CC=C2C(=C1)OC[C@@]1(O)C3=CC4=C(C=C3O[C@@H]21)OCO4 +CNP0260370.1,1,1,2,CNP0260370.1,CNP0260370.1_out_converted.sdf,CC1=C(C)C2=CC(=O)[C@](C)(O)[C@H](O)[C@@H]2CO1 +CNP0232261.2,1,1,2,CNP0232261.2,CNP0232261.2_out_converted.sdf,COC1=CC=C([C@H]2COC3=CC(O)=CC=C3C2)C(O)=C1O +C16405,1,1,2,C16405,C16405_out_converted.sdf,COC1=C(C=CC(=C1)C=CC(=O)C2=C(C=C(C=C2O)O)O)O +DCPIP,1,1,2,DCPIP,DCPIP_out_converted.sdf,O=C1C(Cl)=CC(=Nc2ccc(O)cc2)C=C1Cl +C19588,1,1,2,C19588,C19588_out_converted.sdf,COC1=C2C3=C(C(=O)CC3)C(=O)OC2=C4C5C(C(OC5OC4=C1)O)O +CPD-9557,1,1,2,CPD-9557,CPD-9557_out_converted.sdf,Cc1cc(O)c2c(c1)Cc1cc(O)cc(O)c1C2=O +CNP0216219.17,1,1,2,CNP0216219.17,CNP0216219.17_out_converted.sdf,CO[C@H]1C=C[C@@]23C4=CC5=C(C=C4[C@H](O)[N@@](C[C@@H]2O)[C@H]3C1)OCO5 +CPD-8215,1,1,2,CPD-8215,CPD-8215_out_converted.sdf,Cc1cc(O)c2c(c1)C(=O)c1cc([O-])cc(O)c1C2=O +CNP0243499.1,1,1,2,CNP0243499.1,CNP0243499.1_out_converted.sdf,COC(=O)/C(C)=C/[C@@H](O)C1=CC2=CC=C(OC)C(O)=C2OC1=O +CPD-7027,1,1,2,CPD-7027,CPD-7027_out_converted.sdf,COc1cc2c(=O)c(-c3ccc(O)cc3)coc2cc1O +CPD-6955,1,1,2,CPD-6955,CPD-6955_out_converted.sdf,O=C(/C=C/c1ccc(O)cc1)Cc1cc([O-])cc(=O)o1 +CPD-693,1,1,2,CPD-693,CPD-693_out_converted.sdf,CC1=CC(=O)CC(C)(C)[C@@]1(O)/C=C/C(C)=C\C(=O)[O-] +CNP0248017.2,1,1,2,CNP0248017.2,CNP0248017.2_out_converted.sdf,CCC(=O)NCC[C@@]1(O)C2=C(N=CS2)C2=NC=CC3=C4C=CC=CC4=NC1=C23 +CNP0090058.1,1,1,2,CNP0090058.1,CNP0090058.1_out_converted.sdf,CCOC(=O)C1=C(C=O)C2=C3CCC(=O)[C@H](C3)C3=CC4=C(C=C3CC[C@@H](CO)COC3=C2C(=C(CO)C2=C3C[C@@H]([C@](C)(O)CCCO)O2)O1)NC=C4 +CPD-498,1,1,2,CPD-498,CPD-498_out_converted.sdf,C=C1C[C@]23C[C@H]1CC[C@@H]2C1=CC(=O)C[C@@](C)(C(=O)[O-])[C@H]1[C@@H]3C(=O)[O-] +CNP0251052.1,1,1,2,CNP0251052.1,CNP0251052.1_out_converted.sdf,C[C@@H](CO)[C@@H](C)C[C@@H](O)[C@](C)(O)[C@H]1CC[C@@]2(O)C3=CC(=O)[C@]4(O)C[C@@H](O)[C@@H](O)C[C@]4(C)[C@H]3CC[C@]12C +CNP0120483.1,1,1,2,CNP0120483.1,CNP0120483.1_out_converted.sdf,C=C(C)C1=CC2=C3O[C@H]4C5=CC=C(O)C=C5OC[C@@]4(O)C3=CC=C2O1 +CNP0122740.1,1,1,2,CNP0122740.1,CNP0122740.1_out_converted.sdf,C=C1CC[C@@H]2[C@@](C)(CO)C[C@@H](O)C[C@]2(C)[C@@H]1CCC1=CC(=O)OC1 +CNP0147378.1,1,1,2,CNP0147378.1,CNP0147378.1_out_converted.sdf,CC(=O)OC[C@]12O[C@]34C=C(C)C(=O)C[C@]3(C)[C@@]1(C)[C@H](O)C[C@H]2O4 +CNP0153223.1,1,1,2,CNP0153223.1,CNP0153223.1_out_converted.sdf,CCC[C@H]1OCC2=C([C@@H](O)[C@@H]3O[C@@H]3C2=O)[C@@H]1O +CNP0210135.1,1,1,2,CNP0210135.1,CNP0210135.1_out_converted.sdf,COC1=CC=C2C(=C1)O[C@@H]1C3=CC(OC)=C(O)C=C3OC[C@H]21 +CNP0426805.1,1,1,2,CNP0426805.1,CNP0426805.1_out_converted.sdf,CC[C@@H]1CN2CCC3=C(NC4=CC=CC=C34)C2=C/C1=C(\C=O)C(=O)OC +CNP0191964.1,1,1,2,CNP0191964.1,CNP0191964.1_out_converted.sdf,C/C=C(\C)[C@H]1OC2=C(C(=O)NC=C2C2=CC=CC=C2)[C@]1(C)CO +CNP0199889.5,1,1,2,CNP0199889.5,CNP0199889.5_out_converted.sdf,C=C1C(=O)O[C@@H]2CC(C)=C([C@@H](C)CCCOC(C)=O)[C@H](O)[C@@H]12 +CNP0359684.0,1,1,2,CNP0359684.0,CNP0359684.0_out_converted.sdf,COC1=CC2=C(OC3=CC=C(O)C=C3C2=O)C(OC)=C1OC +CNP0292276.1,1,1,2,CNP0292276.1,CNP0292276.1_out_converted.sdf,C[C@H](C(=O)O)[C@H]1CCC2=CC(=O)C[C@H](C)[C@@]2(C)C1 +CNP0353297.0,1,1,2,CNP0353297.0,CNP0353297.0_out_converted.sdf,COC1=CC2=C(C=C1O)C(C=O)=C(C1=CC=C(O)C=C1O)O2 +CNP0204494.2,1,1,2,CNP0204494.2,CNP0204494.2_out_converted.sdf,COC1=CC(O)=CC2=C1C(=O)[C@@H](C1=CC=C3OC(C)(C)C=CC3=C1O)CO2 +CNP0352117.1,1,1,2,CNP0352117.1,CNP0352117.1_out_converted.sdf,CC[C@H](C)C[C@@]1(C)C=C/C(=C2/C(=O)O[C@H](CC(=O)O)C2=O)O1 +CNP0204979.2,1,1,2,CNP0204979.2,CNP0204979.2_out_converted.sdf,CC(C)=CCC1=C([C@H]2CC(=O)C3=C(O)C=C(O)C=C3O2)C=CC(O)=C1O +CNP0342898.3,1,1,2,CNP0342898.3,CNP0342898.3_out_converted.sdf,COC1=CC(O)=CC([C@H](/C=C\C2=CC=C(O)C=C2)[C@@H](O)CO)=C1 +CNP0329144.1,1,1,2,CNP0329144.1,CNP0329144.1_out_converted.sdf,C=C(C)[C@@H](O)CC1=C(OC)C(=O)C2=CC(OC)=C(O)C(O)=C2C1=O +CNP0328219.1,1,1,2,CNP0328219.1,CNP0328219.1_out_converted.sdf,C=C1[C@@H](O)C[C@H]2C(C)(C)C[C@H](Cl)C[C@@]2(C)[C@H]1C[C@H](O)[C@H]1CC(=O)NC1=O +CNP0320138.1,1,1,2,CNP0320138.1,CNP0320138.1_out_converted.sdf,C=C(C)[C@@H](CC1=C2OC(C)(C)C=CC2=C(O)C2=C1OC1=CC=C(O)C=C1C2=O)OO +CNP0318279.1,1,1,2,CNP0318279.1,CNP0318279.1_out_converted.sdf,COC(=O)N1C=C([C@](OC)(C2=CNC3=CC(Br)=CC=C23)S(=O)(=O)O)C2=CC=C(Br)C=C21 +CNP0317733.1,1,1,2,CNP0317733.1,CNP0317733.1_out_converted.sdf,C=C1C(=O)O[C@H](C2=COC=C2)C[C@@H]1[C@@]1(C)CC=C[C@@]2(C)COC(=O)[C@H]12 +CNP0308290.1,1,1,2,CNP0308290.1,CNP0308290.1_out_converted.sdf,C[C@@H]1CC(=O)[C@]23CO[C@@H](O)[C@@H]2CC(=O)C[C@@H]3[C@@]12C[C@@H](C1=COC=C1)OC2=O +CNP0307229.6,1,1,2,CNP0307229.6,CNP0307229.6_out_converted.sdf,O=C(OC1C[C@H]2CC[C@@H](C1)N2)[C@@H](CO)C1=CC=CC=C1 +CNP0302706.2,1,1,2,CNP0302706.2,CNP0302706.2_out_converted.sdf,C[C@H]1C(=O)C2=C(O)C=C(O)C=C2O[C@H]1C1=CC=C(O)C=C1 +CNP0363021.1,1,1,2,CNP0363021.1,CNP0363021.1_out_converted.sdf,O[C@@]12COC3=CC4=C(C=CO4)C=C3[C@@H]1OC1=CC3=C(C=C12)OCO3 +CNP0365331.1,1,1,2,CNP0365331.1,CNP0365331.1_out_converted.sdf,O=C1C[C@@H](C2=CC=C3OCC=CC3=C2)OC2=CC(O)=CC=C12 +CNP0399207.2,1,1,2,CNP0399207.2,CNP0399207.2_out_converted.sdf,O=C1C[C@H]2O[C@H]([C@H](O)C3=CC=CC=C3)[C@@H](O)[C@@H]2O1 +CNP0425372.1,1,1,2,CNP0425372.1,CNP0425372.1_out_converted.sdf,CC[C@H](C)[C@H](C1=C(O)C(C)(C)C(=O)C(C)(C)C1=O)C1=C(O)C(C)=C(O)C(C(=O)CCC2=CC=CC=C2)=C1O +CNP0426639.0,1,1,2,CNP0426639.0,CNP0426639.0_out_converted.sdf,O=C1C2=C(COC3=CC=CC(O)=C32)OC2=CC3=C(OCO3)C(O)=C12 +CNP0165737.1,1,1,2,CNP0165737.1,CNP0165737.1_out_converted.sdf,C/C=C(\C=C(C)\C=C\CC/C=C(\C)C(=O)[C@]12O[C@H]1[C@@](C)(O)NC2=O)C(=O)OC +CNP0426155.1,1,1,2,CNP0426155.1,CNP0426155.1_out_converted.sdf,CC[C@H](C)[C@@H]1NCC[C@]12C(=O)NC1=CC(O)=CC=C12 +CNP0276575.3,1,1,2,CNP0276575.3,CNP0276575.3_out_converted.sdf,O=C1C=CC[C@@H](/C=C/C[C@H](O)C[C@@H](O)/C=C/C2=CC=CC=C2)O1 +CNP0286670.1,1,1,2,CNP0286670.1,CNP0286670.1_out_converted.sdf,C=CC(C)(C)C1=CC([C@H]2COC3=CC(O)=CC=C3C2)=C(O)C(O)=C1OC +CNP0228022.1,1,1,2,CNP0228022.1,CNP0228022.1_out_converted.sdf,CC1=CC=C(C(=O)O[C@@H]2C=C3CCN(C)[C@H]3[C@H](C3=CC4=C(C=C3C(=O)O)OCO4)C2)C(C)=N1 +CNP0425515.0,1,1,2,CNP0425515.0,CNP0425515.0_out_converted.sdf,COC(=O)C1=C(C2=CNC3=CC(Cl)=C(Cl)C=C23)C(C2=CNC3=CC=C(Cl)C=C23)=CN1 +CNP0425203.1,1,1,2,CNP0425203.1,CNP0425203.1_out_converted.sdf,CC1(C)CCCC(=O)[C@H]1CCC1=C[C@@H](O)CC1=O +CNP0212403.1,1,1,2,CNP0212403.1,CNP0212403.1_out_converted.sdf,COC1=C([C@]2(O)COC3=CC(O)=CC(O)=C3C2=O)C=CC(O)=C1CC=C(C)C +CNP0424844.1,1,1,2,CNP0424844.1,CNP0424844.1_out_converted.sdf,COC1=CC2=C(C(O)=C1C1=CC(=O)CC1)[C@@H]1C=CO[C@@H]1O2 +CNP0424373.0,1,1,2,CNP0424373.0,CNP0424373.0_out_converted.sdf,CN1C2=CC3=C(C=C2C(=O)C2=C1OC=C2)OCO3 +CNP0424274.0,1,1,2,CNP0424274.0,CNP0424274.0_out_converted.sdf,COC1=CC(O)=C(C2=COC3=CC(O)=CC(O)=C3C2=O)C=C1CC=C(C)C +CNP0424223.0,1,1,2,CNP0424223.0,CNP0424223.0_out_converted.sdf,OC1=CC=C2C(C3=C(O)C=CC4=CC(O)=CC=C34)=C(O)C=CC2=C1 +CNP0412522.2,1,1,2,CNP0412522.2,CNP0412522.2_out_converted.sdf,COC1=CC2=C(C=C1[C@H]1COC3=CC(O)=CC=C3C1)OCO2 +CNP0406736.1,1,1,2,CNP0406736.1,CNP0406736.1_out_converted.sdf,C[C@H](O)/C=C1\C[C@H](O)[C@@]23C[C@@H]2C(C)(C)O[C@@]3(O)C1=O +CNP0179144.1,1,1,2,CNP0179144.1,CNP0179144.1_out_converted.sdf,COC1=CC2=C(CN3C[C@@H](O)[C@]24CC[C@H](OC)C[C@@H]34)C(OC)=C1O +CNP0226086.2,1,1,2,CNP0226086.2,CNP0226086.2_out_converted.sdf,C=C1C(=O)O[C@@H](/C=C(\C)CO)[C@H]1[C@@H](C/C(C)=C/CO)OC(=O)/C(C)=C\C diff --git a/src/chemplot_select_and_mark.py b/src/chemplot_select_and_mark.py new file mode 100644 index 0000000..87cfed9 --- /dev/null +++ b/src/chemplot_select_and_mark.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +兼容你这版 ChemPlot (from_smiles(sim_type=...), .umap(), .cluster(), .interactive_plot()) +流程: +1) 原始数据 -> UMAP+KMeans -> 选 Top-30 -> 导出 selected_top30.csv + + 交互图 chemplot_interactive_original.html(用原始 Total_Count 着色) +2) 将 Top-30 的 Total_Count=100 -> 交互图 chemplot_interactive_marked.html(突出显示) +""" + +import argparse +from pathlib import Path +import numpy as np +import pandas as pd +from chemplot import Plotter +from sklearn.cluster import KMeans +import matplotlib.pyplot as plt + +# --- 小工具 --- + +def guess_smiles_col(df: pd.DataFrame): + for c in df.columns: + if c.lower() in ("smiles", "smile", "canonical_smiles"): + return c + # 简单启发式兜底 + import re + pat = re.compile(r"^[A-Za-z0-9@+\-\[\]\(\)=#\\/]+$") + best, r = None, -1 + for c in df.columns: + vals = df[c].dropna().astype(str).head(50).tolist() + ok = [bool(pat.match(s)) and any(ch in s for ch in "CNOH[]()") for s in vals] + ratio = float(np.mean(ok)) if ok else 0.0 + if ratio > r: + best, r = c, ratio + return best + +def farthest_point_sampling(X: np.ndarray, k: int, seed: int = 42, init: int | None = None): + n = X.shape[0] + if n == 0: return [] + if init is None: + init = int(np.argmax(np.linalg.norm(X - X.mean(axis=0), axis=1))) + sel = [init] + dmin = np.linalg.norm(X - X[init], axis=1) + for _ in range(1, min(k, n)): + j = int(np.argmax(dmin)) + sel.append(j) + dmin = np.minimum(dmin, np.linalg.norm(X - X[j], axis=1)) + return sel + +def kmeans_then_diverse(coords: np.ndarray, n_clusters: int, topk: int, seed: int = 42): + km = KMeans(n_clusters=n_clusters, random_state=seed, n_init="auto") + labels = km.fit_predict(coords) + # 每簇取“中心最近”样本 + picks = [] + for c in range(n_clusters): + idx = np.where(labels == c)[0] + if idx.size == 0: continue + center = km.cluster_centers_[c] + j = idx[np.argmin(np.linalg.norm(coords[idx] - center, axis=1))] + picks.append(j) + # 用 FPS 补/裁成 topk,保证多样性 + if len(picks) < topk: + return labels, farthest_point_sampling(coords, topk, seed=seed) + if len(picks) > topk: + sub = coords[picks] + order = farthest_point_sampling(sub, topk, seed=seed) + return labels, [picks[i] for i in order] + return labels, picks + +def static_preview(coords: np.ndarray, selected: list[int], out_png: Path, title: str): + import matplotlib.pyplot as plt + plt.figure(figsize=(7,6)) + plt.scatter(coords[:,0], coords[:,1], s=10, alpha=0.5) + if selected: + sel = np.array(selected) + plt.scatter(coords[sel,0], coords[sel,1], s=40, marker='x') + plt.title(title); plt.xlabel("Dim-1"); plt.ylabel("Dim-2") + plt.tight_layout(); plt.savefig(out_png, dpi=200); plt.close() + +# --- 主流程 --- + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--csv", default="data/title_intersection_analysis.csv") + ap.add_argument("--outdir", default="chemplot_output") + ap.add_argument("--n-clusters", type=int, default=30) + ap.add_argument("--topk", type=int, default=30) + ap.add_argument("--neighbors", type=int, default=None) # 传给 umap() + ap.add_argument("--min-dist", type=float, default=None) # 传给 umap() + ap.add_argument("--seed", type=int, default=42) + args = ap.parse_args() + + outdir = Path(args.outdir); outdir.mkdir(parents=True, exist_ok=True) + + df = pd.read_csv(args.csv) + smiles_col = guess_smiles_col(df) + if not smiles_col: + raise RuntimeError("无法识别 SMILES 列;请用 --smiles-col 指定。") + + # ------- 第一次:原始数据 -> UMAP -> KMeans -> Top-30 ------- + # 用原始 Total_Count 作为 target 做颜色(连续型 => target_type='R') + target_orig = df["Total_Count"].tolist() if "Total_Count" in df.columns else [0]*len(df) + + plotter = Plotter.from_smiles( + df[smiles_col].tolist(), + target=target_orig, + target_type="R", + sim_type="structural", + ) + + # UMAP 降维 + # 你这版API:plotter.umap(n_neighbors=?, min_dist=?, pca=False, random_state=?) + emb = plotter.umap( + n_neighbors=args.neighbors, + min_dist=args.min_dist, + pca=False, + random_state=args.seed + ) + coords = emb.iloc[:, :2].to_numpy().copy() + + # KMeans + 多样性 Top-30 + labels, selected_idx = kmeans_then_diverse(coords, n_clusters=args.n_clusters, topk=args.topk, seed=args.seed) + + # 保存 selected_top30.csv(带上原 DataFrame 的字段) + df_sel = df.iloc[selected_idx].copy() + df_sel.to_csv(outdir / "selected_top30.csv", index=False) + + # 保存 embedding_with_labels.csv(便于后续分析) + df_emb = df.copy() + df_emb["x"] = coords[:,0]; df_emb["y"] = coords[:,1] + df_emb["cluster"] = labels + df_emb["Selected"] = 0 + df_emb.loc[df_emb.index.isin(selected_idx), "Selected"] = 1 + df_emb.to_csv(outdir / "embedding_with_labels.csv", index=False) + + # 交互 HTML(原始 Total_Count) + plotter.cluster(n_clusters=args.n_clusters, random_state=args.seed) + plotter.interactive_plot( + size=900, + kind="scatter", + remove_outliers=False, + is_colored=True, # 用 target (Total_Count) 上色 + clusters=True, # 附带 clusters 标签页 + filename=str(outdir / "chemplot_interactive_original.html"), + show_plot=False, + title="UMAP + KMeans (original Total_Count)" + ) + + # 静态预览图 + static_preview(coords, selected_idx, outdir / "scatter_preview.png", "UMAP + KMeans + Top-30") + + # ------- 第二次:把 Top-30 的 Total_Count 设为 100 -> 交互 HTML ------- + target_marked = df["Total_Count"].tolist() if "Total_Count" in df.columns else [0]*len(df) + for i in selected_idx: + target_marked[i] = 100 + + plotter2 = Plotter.from_smiles( + df[smiles_col].tolist(), + target=target_marked, + target_type="R", + sim_type="structural", + ) + # 为了与第一次布局一致,仍旧跑一次 UMAP(random_state固定,布局稳定) + plotter2.umap( + n_neighbors=args.neighbors, + min_dist=args.min_dist, + pca=False, + random_state=args.seed + ) + plotter2.cluster(n_clusters=args.n_clusters, random_state=args.seed) + plotter2.interactive_plot( + size=900, + kind="scatter", + remove_outliers=False, + is_colored=True, + clusters=True, + filename=str(outdir / "chemplot_interactive_marked.html"), + show_plot=False, + title="UMAP + KMeans (Top-30 set Total_Count=100)" + ) + + print("Done. Outputs:", outdir.resolve()) + +if __name__ == "__main__": + main()