From 0d1d542919b38cf56a7c8ec82c6837a1d462d56d Mon Sep 17 00:00:00 2001
From: lingyuzeng
Date: Wed, 20 Aug 2025 22:01:46 +0800
Subject: [PATCH] =?UTF-8?q?=E8=BF=87=E6=BB=A4=E5=9C=A8=20karamadock=20?=
 =?UTF-8?q?=E4=B8=8E=20AutoDock=20vina=20=E4=B8=AD=E8=81=9A=E7=B1=BB?=
 =?UTF-8?q?=E5=90=8E=E5=89=8D=201000=20=E5=88=86=E5=AD=90=E5=9C=A8=20glide?=
 =?UTF-8?q?=20=E5=AF=B9=E6=8E=A5=E7=9A=84=E4=BA=A4=E9=9B=86=E7=BB=93?=
 =?UTF-8?q?=E6=9E=9C.=EF=BC=88trpe=20=E4=B8=8E=20fgbar=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../jupyter/filter_docking_and_dedupe.ipynb   | 154 ++++++++++++++++++
 scripts/merge_csv.py                          |  23 ---
 2 files changed, 154 insertions(+), 23 deletions(-)
 create mode 100644 scripts/jupyter/filter_docking_and_dedupe.ipynb
 delete mode 100644 scripts/merge_csv.py

diff --git a/scripts/jupyter/filter_docking_and_dedupe.ipynb b/scripts/jupyter/filter_docking_and_dedupe.ipynb
new file mode 100644
index 0000000..81765bd
--- /dev/null
+++ b/scripts/jupyter/filter_docking_and_dedupe.ipynb
@@ -0,0 +1,154 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 过滤与降重: docking score < -5.210,并分别按 Entry Name 统计 (karma 和 vina)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from pathlib import Path\n",
+    "import re\n",
+    "\n",
+    "# Configuration: docking-score cutoff; only rows strictly below it are kept.\n",
+    "threshold = -5.210\n",
+    "\n",
+    "# Locate the repository root: check `start` and up to four of its ancestors for a directory holding both 'result' and 'scripts'; fall back to `start` itself if none matches.\n",
+    "def find_repo_root(start: Path) -> Path:\n",
+    "    cur = start.resolve()\n",
+    "    for _ in range(5):\n",
+    "        if (cur / 'result').exists() and (cur / 'scripts').exists():\n",
+    "            return cur\n",
+    "        if cur.parent == cur:\n",
+    "            break\n",
+    "        cur = cur.parent\n",
+    "    return start.resolve()\n",
+    "\n",
+    "root = find_repo_root(Path.cwd())\n",
+    "base_dir = root / 'result' / 'glide'\n",
+    "karma_path = base_dir / 'trpe_karma_score_aligned_top1000.csv'\n",
+    "vina_path = base_dir / 'trpe_vina_score_aligned_top1000.csv'\n",
+    "\n",
+    "# Load the two score tables.\n",
+    "df_karma = pd.read_csv(karma_path)\n",
+    "df_vina = pd.read_csv(vina_path)\n",
+    "\n",
+    "# Keep only the rows whose docking score is below the threshold (copies, so new columns can be added safely).\n",
+    "filtered_karma = df_karma[df_karma['docking score'] < threshold].copy()\n",
+    "filtered_vina = df_vina[df_vina['docking score'] < threshold].copy()\n",
+    "\n",
+    "# Normalize an Entry Name by stripping trailing numeric version suffixes such as .1 or .1.1; non-string values are returned unchanged.\n",
+    "def normalize_entry(s: str) -> str:\n",
+    "    if not isinstance(s, str):\n",
+    "        return s\n",
+    "    return re.sub(r'(?:\\.\\d+)+$', '', s)\n",
+    "\n",
+    "for df in (filtered_karma, filtered_vina):\n",
+    "    df['Entry Name Base'] = df['Entry Name'].astype(str).map(normalize_entry)\n",
+    "\n",
+    "# Report how many rows survive the filter.\n",
+    "print(f\"karma: {len(filtered_karma)} / {len(df_karma)} 保留 (threshold = {threshold})\")\n",
+    "print(f\"vina : {len(filtered_vina)} / {len(df_vina)} 保留 (threshold = {threshold})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# karma: count rows per base Entry Name, then deduplicate by keeping each entry's whole best-scoring (lowest docking score) row; drop_duplicates is used because GroupBy.first() takes the first non-null value per column and can mix values from different rows.\n",
+    "entry_counts_karma = (\n",
+    "    filtered_karma.groupby('Entry Name Base')\n",
+    "    .size()\n",
+    "    .rename('count')\n",
+    "    .reset_index()\n",
+    "    .sort_values('count', ascending=False)\n",
+    ")\n",
+    "best_per_entry_karma = (\n",
+    "    filtered_karma.sort_values('docking score', ascending=True)\n",
+    "    .drop_duplicates(subset='Entry Name Base', keep='first')\n",
+    "    .sort_values('Entry Name Base', ignore_index=True)\n",
+    ")\n",
+    "dedup_with_counts_karma = best_per_entry_karma.merge(entry_counts_karma, on='Entry Name Base', how='left')\n",
+    "\n",
+    "print(f\"karma: {len(filtered_karma)} 条,Entry Name 去重后: {len(dedup_with_counts_karma)}\")\n",
+    "entry_counts_karma.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# vina: count rows per base Entry Name, then deduplicate by keeping each entry's whole best-scoring (lowest docking score) row; drop_duplicates is used because GroupBy.first() takes the first non-null value per column and can mix values from different rows.\n",
+    "entry_counts_vina = (\n",
+    "    filtered_vina.groupby('Entry Name Base')\n",
+    "    .size()\n",
+    "    .rename('count')\n",
+    "    .reset_index()\n",
+    "    .sort_values('count', ascending=False)\n",
+    ")\n",
+    "best_per_entry_vina = (\n",
+    "    filtered_vina.sort_values('docking score', ascending=True)\n",
+    "    .drop_duplicates(subset='Entry Name Base', keep='first')\n",
+    "    .sort_values('Entry Name Base', ignore_index=True)\n",
+    ")\n",
+    "dedup_with_counts_vina = best_per_entry_vina.merge(entry_counts_vina, on='Entry Name Base', how='left')\n",
+    "\n",
+    "print(f\"vina : {len(filtered_vina)} 条,Entry Name 去重后: {len(dedup_with_counts_vina)}\")\n",
+    "entry_counts_vina.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Intersection of the karma and vina base Entry Names, shown with each side's row count.\n",
+    "karma_names = set(entry_counts_karma['Entry Name Base'])\n",
+    "vina_names = set(entry_counts_vina['Entry Name Base'])\n",
+    "common_names = sorted(karma_names & vina_names)\n",
+    "\n",
+    "common_df = (\n",
+    "    pd.DataFrame({'Entry Name Base': common_names})\n",
+    "    .merge(entry_counts_karma.rename(columns={'count': 'karma_count'}), on='Entry Name Base', how='left')\n",
+    "    .merge(entry_counts_vina.rename(columns={'count': 'vina_count'}), on='Entry Name Base', how='left')\n",
+    "    .sort_values(['karma_count', 'vina_count', 'Entry Name Base'], ascending=[False, False, True])\n",
+    ")\n",
+    "print(f\"交集个数: {len(common_df)}\")\n",
+    "common_df.head(20)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "vina",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/scripts/merge_csv.py b/scripts/merge_csv.py
deleted file mode 100644
index 9866814..0000000
--- a/scripts/merge_csv.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# 合并 COCOUNT 与 plant 的对接结果。
-from pathlib import Path
-import pandas as pd
-
-data = {
-    "plant": ["qed_values_poses_trpe_converted.csv", "qed_values_poses_fgbar_converted.csv"],
-    "cocount": ["qed_values_trpe.csv", "qed_values_fgbar.csv"]
-}
-
-df_trpe_plant1 = pd.read_csv(data["plant"][0])
-df_fgbar_plant2 = pd.read_csv(data["plant"][1])
-df_trpe_plant1["source"] = "plant"
-df_fgbar_plant2["source"] = "plant"
-df_trpe_cocount1 = pd.read_csv(data["cocount"][0])
-df_fgbar_cocount2 = pd.read_csv(data["cocount"][1])
-df_trpe_cocount1["source"] = "cocount"
-df_fgbar_cocount2["source"] = "cocount"
-
-df_trpe = pd.concat([df_trpe_plant1, df_trpe_cocount1])
-df_fgbar = pd.concat([df_fgbar_plant2, df_fgbar_cocount2])
-
-df_trpe.to_csv("qed_values_poses_trpe_all.csv", index=False)
-df_fgbar.to_csv("qed_values_poses_fgbar_all.csv", index=False) # 41166 条数据
\ No newline at end of file