From 0d1d542919b38cf56a7c8ec82c6837a1d462d56d Mon Sep 17 00:00:00 2001
From: lingyuzeng <pylyzeng@gmail.com>
Date: Wed, 20 Aug 2025 22:01:46 +0800
Subject: [PATCH] =?UTF-8?q?=E8=BF=87=E6=BB=A4=E5=9C=A8=20karamadock=20?=
 =?UTF-8?q?=E4=B8=8E=20AutoDock=20vina=20=E4=B8=AD=E8=81=9A=E7=B1=BB?=
 =?UTF-8?q?=E5=90=8E=E5=89=8D=201000=20=E5=88=86=E5=AD=90=E5=9C=A8=20glide?=
 =?UTF-8?q?=20=E5=AF=B9=E6=8E=A5=E7=9A=84=E4=BA=A4=E9=9B=86=E7=BB=93?=
 =?UTF-8?q?=E6=9E=9C.=EF=BC=88trpe=20=E4=B8=8E=20fgbar=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../jupyter/filter_docking_and_dedupe.ipynb   | 154 ++++++++++++++++++
 scripts/merge_csv.py                          |  23 ---
 2 files changed, 154 insertions(+), 23 deletions(-)
 create mode 100644 scripts/jupyter/filter_docking_and_dedupe.ipynb
 delete mode 100644 scripts/merge_csv.py

diff --git a/scripts/jupyter/filter_docking_and_dedupe.ipynb b/scripts/jupyter/filter_docking_and_dedupe.ipynb
new file mode 100644
index 0000000..81765bd
--- /dev/null
+++ b/scripts/jupyter/filter_docking_and_dedupe.ipynb
@@ -0,0 +1,154 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 过滤与降重: docking score < -5.210，并分别按 Entry Name 统计 (karma 和 vina)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from pathlib import Path\n",
+    "import re\n",
+    "\n",
+    "# Configuration: rows are kept when their docking score is strictly below this value\n",
+    "threshold = -5.210\n",
+    "\n",
+    "# Locate the repository root: the resolved start dir or one of its first 4 ancestors holding both result and scripts\n",
+    "def find_repo_root(start: Path) -> Path:\n",
+    "    cur = start.resolve()\n",
+    "    for _ in range(5):\n",
+    "        if (cur / 'result').exists() and (cur / 'scripts').exists():\n",
+    "            return cur\n",
+    "        if cur.parent == cur:\n",
+    "            break\n",
+    "        cur = cur.parent\n",
+    "    return start.resolve()\n",
+    "\n",
+    "root = find_repo_root(Path.cwd())\n",
+    "base_dir = root / 'result' / 'glide'\n",
+    "karma_path = base_dir / 'trpe_karma_score_aligned_top1000.csv'\n",
+    "vina_path = base_dir / 'trpe_vina_score_aligned_top1000.csv'\n",
+    "\n",
+    "# Load both score tables\n",
+    "df_karma = pd.read_csv(karma_path)\n",
+    "df_vina = pd.read_csv(vina_path)\n",
+    "\n",
+    "# Keep only the rows whose docking score is below the threshold (copies, so columns can be added safely)\n",
+    "filtered_karma = df_karma[df_karma['docking score'] < threshold].copy()\n",
+    "filtered_vina = df_vina[df_vina['docking score'] < threshold].copy()\n",
+    "\n",
+    "# Normalize Entry Name: strip trailing numeric version suffixes such as .1 or .1.1; non-strings pass through unchanged\n",
+    "def normalize_entry(s: str) -> str:\n",
+    "    if not isinstance(s, str):\n",
+    "        return s\n",
+    "    return re.sub(r'(?:\\.\\d+)+$', '', s)\n",
+    "\n",
+    "for df in (filtered_karma, filtered_vina):  # NOTE(review): astype(str) may turn missing names into the literal 'nan' - confirm inputs have none\n",
+    "    df['Entry Name Base'] = df['Entry Name'].astype(str).map(normalize_entry)\n",
+    "\n",
+    "# Report how many rows survive the filter\n",
+    "print(f\"karma: {len(filtered_karma)} / {len(df_karma)} 保留 (threshold = {threshold})\")\n",
+    "print(f\"vina : {len(filtered_vina)} / {len(df_vina)} 保留 (threshold = {threshold})\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# karma: count rows per normalized Entry Name, then dedupe keeping the row with the lowest docking score\n",
+    "entry_counts_karma = (\n",
+    "    filtered_karma.groupby('Entry Name Base')\n",
+    "    .size()\n",
+    "    .rename('count')\n",
+    "    .reset_index()\n",
+    "    .sort_values('count', ascending=False)\n",
+    ")\n",
+    "best_per_entry_karma = (\n",
+    "    filtered_karma.sort_values('docking score', ascending=True)\n",
+    "    .drop_duplicates(subset='Entry Name Base', keep='first')  # keeps whole rows; GroupBy.first() takes the first non-null value per column\n",
+    "    .reset_index(drop=True)\n",
+    ")\n",
+    "dedup_with_counts_karma = best_per_entry_karma.merge(entry_counts_karma, on='Entry Name Base', how='left')\n",
+    "\n",
+    "print(f\"karma: {len(filtered_karma)} 条，Entry Name 去重后: {len(dedup_with_counts_karma)}\")\n",
+    "entry_counts_karma.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# vina: count rows per normalized Entry Name, then dedupe keeping the row with the lowest docking score\n",
+    "entry_counts_vina = (\n",
+    "    filtered_vina.groupby('Entry Name Base')\n",
+    "    .size()\n",
+    "    .rename('count')\n",
+    "    .reset_index()\n",
+    "    .sort_values('count', ascending=False)\n",
+    ")\n",
+    "best_per_entry_vina = (\n",
+    "    filtered_vina.sort_values('docking score', ascending=True)\n",
+    "    .drop_duplicates(subset='Entry Name Base', keep='first')  # keeps whole rows; GroupBy.first() takes the first non-null value per column\n",
+    "    .reset_index(drop=True)\n",
+    ")\n",
+    "dedup_with_counts_vina = best_per_entry_vina.merge(entry_counts_vina, on='Entry Name Base', how='left')\n",
+    "\n",
+    "print(f\"vina : {len(filtered_vina)} 条，Entry Name 去重后: {len(dedup_with_counts_vina)}\")\n",
+    "entry_counts_vina.head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Intersection of the normalized Entry Names found by karma and vina, shown with each method's row count\n",
+    "karma_names = set(entry_counts_karma['Entry Name Base'])\n",
+    "vina_names = set(entry_counts_vina['Entry Name Base'])\n",
+    "common_names = sorted(karma_names & vina_names)\n",
+    "\n",
+    "common_df = (\n",
+    "    pd.DataFrame({'Entry Name Base': common_names})\n",
+    "    .merge(entry_counts_karma.rename(columns={'count': 'karma_count'}), on='Entry Name Base', how='left')\n",
+    "    .merge(entry_counts_vina.rename(columns={'count': 'vina_count'}), on='Entry Name Base', how='left')\n",
+    "    .sort_values(['karma_count', 'vina_count', 'Entry Name Base'], ascending=[False, False, True])\n",
+    ")\n",
+    "print(f\"交集个数: {len(common_df)}\")\n",
+    "common_df.head(20)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "vina",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/scripts/merge_csv.py b/scripts/merge_csv.py
deleted file mode 100644
index 9866814..0000000
--- a/scripts/merge_csv.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# 合并 COCOUNT 与 plant 的对接结果。
-from pathlib import Path
-import pandas as pd
-
-data = {
-    "plant": ["qed_values_poses_trpe_converted.csv", "qed_values_poses_fgbar_converted.csv"],
-    "cocount": ["qed_values_trpe.csv", "qed_values_fgbar.csv"]
-}
-
-df_trpe_plant1 = pd.read_csv(data["plant"][0])
-df_fgbar_plant2 = pd.read_csv(data["plant"][1])
-df_trpe_plant1["source"] = "plant"
-df_fgbar_plant2["source"] = "plant"
-df_trpe_cocount1 = pd.read_csv(data["cocount"][0])
-df_fgbar_cocount2 = pd.read_csv(data["cocount"][1])
-df_trpe_cocount1["source"] = "cocount"
-df_fgbar_cocount2["source"] = "cocount"
-
-df_trpe = pd.concat([df_trpe_plant1, df_trpe_cocount1])
-df_fgbar = pd.concat([df_fgbar_plant2, df_fgbar_cocount2])
-
-df_trpe.to_csv("qed_values_poses_trpe_all.csv", index=False)
-df_fgbar.to_csv("qed_values_poses_fgbar_all.csv", index=False) # 41166 条数据
\ No newline at end of file