过滤在 KarmaDock 与 AutoDock Vina 中聚类后前 1000 分子在 Glide 对接的交集结果 (trpe 与 fgbar)。

This commit is contained in:
2025-08-20 22:01:46 +08:00
parent 7ee5b18b99
commit 0d1d542919
2 changed files with 154 additions and 23 deletions

View File

@@ -0,0 +1,154 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 过滤与降重: docking score < -5.210,并分别按 Entry Name 统计 (karma 和 vina)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd",
"from pathlib import Path",
"import re",
"",
"# 配置",
"threshold = -5.210",
"",
"# 寻找仓库根目录 (包含 result 与 scripts 的文件夹)",
"def find_repo_root(start: Path) -> Path:",
" cur = start.resolve()",
" for _ in range(5):",
" if (cur / 'result').exists() and (cur / 'scripts').exists():",
" return cur",
" if cur.parent == cur:",
" break",
" cur = cur.parent",
" return start.resolve()",
"",
"root = find_repo_root(Path.cwd())",
"base_dir = root / 'result' / 'glide'",
"karma_path = base_dir / 'trpe_karma_score_aligned_top1000.csv'",
"vina_path = base_dir / 'trpe_vina_score_aligned_top1000.csv'",
"",
"# 读取",
"df_karma = pd.read_csv(karma_path)",
"df_vina = pd.read_csv(vina_path)",
"",
"# 过滤 docking score < -5.210 的行",
"filtered_karma = df_karma[df_karma['docking score'] < threshold].copy()",
"filtered_vina = df_vina[df_vina['docking score'] < threshold].copy()",
"",
"# 规范化 Entry Name去掉末尾的 .数字 版本后缀,如 .1 或 .1.1",
"def normalize_entry(s: str) -> str:",
" if not isinstance(s, str):",
" return s",
" return re.sub(r'(?:\\.\\d+)+$', '', s)",
"",
"for df in (filtered_karma, filtered_vina):",
" df['Entry Name Base'] = df['Entry Name'].astype(str).map(normalize_entry)",
"",
"# 简要查看过滤结果规模",
"print(f\"karma: {len(filtered_karma)} / {len(df_karma)} 保留 (threshold = {threshold})\")",
"print(f\"vina : {len(filtered_vina)} / {len(df_vina)} 保留 (threshold = {threshold})\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# karma: 按 Entry Name 统计重复次数并降重 (保留最优 docking score)",
"entry_counts_karma = (",
" filtered_karma.groupby('Entry Name Base')",
" .size()",
" .rename('count')",
" .reset_index()",
" .sort_values('count', ascending=False)",
")",
"best_per_entry_karma = (",
" filtered_karma.sort_values('docking score', ascending=True)",
" .groupby('Entry Name Base', as_index=False)",
" .first()",
")",
"dedup_with_counts_karma = best_per_entry_karma.merge(entry_counts_karma, on='Entry Name Base', how='left')",
"",
"print(f\"karma: {len(filtered_karma)} 条Entry Name 去重后: {len(dedup_with_counts_karma)}\")",
"entry_counts_karma.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# vina: 按 Entry Name 统计重复次数并降重 (保留最优 docking score)",
"entry_counts_vina = (",
" filtered_vina.groupby('Entry Name Base')",
" .size()",
" .rename('count')",
" .reset_index()",
" .sort_values('count', ascending=False)",
")",
"best_per_entry_vina = (",
" filtered_vina.sort_values('docking score', ascending=True)",
" .groupby('Entry Name Base', as_index=False)",
" .first()",
")",
"dedup_with_counts_vina = best_per_entry_vina.merge(entry_counts_vina, on='Entry Name Base', how='left')",
"",
"print(f\"vina : {len(filtered_vina)} 条Entry Name 去重后: {len(dedup_with_counts_vina)}\")",
"entry_counts_vina.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# karma 与 vina 的 Entry Name 交集 (并展示各自 count)",
"karma_names = set(entry_counts_karma['Entry Name Base'])",
"vina_names = set(entry_counts_vina['Entry Name Base'])",
"common_names = sorted(karma_names & vina_names)",
"",
"common_df = (",
" pd.DataFrame({'Entry Name Base': common_names})",
" .merge(entry_counts_karma.rename(columns={'count': 'karma_count'}), on='Entry Name Base', how='left')",
" .merge(entry_counts_vina.rename(columns={'count': 'vina_count'}), on='Entry Name Base', how='left')",
" .sort_values(['karma_count', 'vina_count', 'Entry Name Base'], ascending=[False, False, True])",
")",
"print(f\"交集个数: {len(common_df)}\")",
"common_df.head(20)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "vina",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -1,23 +0,0 @@
# Merge the docking results from COCOUNT and plant.
from pathlib import Path
import pandas as pd

# Input CSV files keyed by docking source; within each list the first entry
# is the trpe table and the second is the fgbar table.
data = {
    "plant": ["qed_values_poses_trpe_converted.csv", "qed_values_poses_fgbar_converted.csv"],
    "cocount": ["qed_values_trpe.csv", "qed_values_fgbar.csv"],
}

# Load the plant tables and tag every row with its origin.
plant_trpe_csv, plant_fgbar_csv = data["plant"]
df_trpe_plant1 = pd.read_csv(plant_trpe_csv).assign(source="plant")
df_fgbar_plant2 = pd.read_csv(plant_fgbar_csv).assign(source="plant")

# Load the cocount tables and tag them the same way.
cocount_trpe_csv, cocount_fgbar_csv = data["cocount"]
df_trpe_cocount1 = pd.read_csv(cocount_trpe_csv).assign(source="cocount")
df_fgbar_cocount2 = pd.read_csv(cocount_fgbar_csv).assign(source="cocount")

# Stack the two sources per target, plant rows first.
df_trpe = pd.concat([df_trpe_plant1, df_trpe_cocount1])
df_fgbar = pd.concat([df_fgbar_plant2, df_fgbar_cocount2])

# Write the combined tables without the (non-unique) row index.
df_trpe.to_csv("qed_values_poses_trpe_all.csv", index=False)
df_fgbar.to_csv("qed_values_poses_fgbar_all.csv", index=False)  # original note: 41166 rows