过滤在 KarmaDock 与 AutoDock Vina 中聚类后前 1000 分子在 Glide 对接的交集结果 (trpe 与 fgbar)。
This commit is contained in:
154
scripts/jupyter/filter_docking_and_dedupe.ipynb
Normal file
154
scripts/jupyter/filter_docking_and_dedupe.ipynb
Normal file
@@ -0,0 +1,154 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# 过滤与降重: docking score < -5.210,并分别按 Entry Name 统计 (karma 和 vina)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd",
|
||||
"from pathlib import Path",
|
||||
"import re",
|
||||
"",
|
||||
"# 配置",
|
||||
"threshold = -5.210",
|
||||
"",
|
||||
"# 寻找仓库根目录 (包含 result 与 scripts 的文件夹)",
|
||||
"def find_repo_root(start: Path) -> Path:",
|
||||
" cur = start.resolve()",
|
||||
" for _ in range(5):",
|
||||
" if (cur / 'result').exists() and (cur / 'scripts').exists():",
|
||||
" return cur",
|
||||
" if cur.parent == cur:",
|
||||
" break",
|
||||
" cur = cur.parent",
|
||||
" return start.resolve()",
|
||||
"",
|
||||
"root = find_repo_root(Path.cwd())",
|
||||
"base_dir = root / 'result' / 'glide'",
|
||||
"karma_path = base_dir / 'trpe_karma_score_aligned_top1000.csv'",
|
||||
"vina_path = base_dir / 'trpe_vina_score_aligned_top1000.csv'",
|
||||
"",
|
||||
"# 读取",
|
||||
"df_karma = pd.read_csv(karma_path)",
|
||||
"df_vina = pd.read_csv(vina_path)",
|
||||
"",
|
||||
"# 过滤 docking score < -5.210 的行",
|
||||
"filtered_karma = df_karma[df_karma['docking score'] < threshold].copy()",
|
||||
"filtered_vina = df_vina[df_vina['docking score'] < threshold].copy()",
|
||||
"",
|
||||
"# 规范化 Entry Name(去掉末尾的 .数字 版本后缀,如 .1 或 .1.1)",
|
||||
"def normalize_entry(s: str) -> str:",
|
||||
" if not isinstance(s, str):",
|
||||
" return s",
|
||||
" return re.sub(r'(?:\\.\\d+)+$', '', s)",
|
||||
"",
|
||||
"for df in (filtered_karma, filtered_vina):",
|
||||
" df['Entry Name Base'] = df['Entry Name'].astype(str).map(normalize_entry)",
|
||||
"",
|
||||
"# 简要查看过滤结果规模",
|
||||
"print(f\"karma: {len(filtered_karma)} / {len(df_karma)} 保留 (threshold = {threshold})\")",
|
||||
"print(f\"vina : {len(filtered_vina)} / {len(df_vina)} 保留 (threshold = {threshold})\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# karma: 按 Entry Name 统计重复次数并降重 (保留最优 docking score)",
|
||||
"entry_counts_karma = (",
|
||||
" filtered_karma.groupby('Entry Name Base')",
|
||||
" .size()",
|
||||
" .rename('count')",
|
||||
" .reset_index()",
|
||||
" .sort_values('count', ascending=False)",
|
||||
")",
|
||||
"best_per_entry_karma = (",
|
||||
" filtered_karma.sort_values('docking score', ascending=True)",
|
||||
" .groupby('Entry Name Base', as_index=False)",
|
||||
" .first()",
|
||||
")",
|
||||
"dedup_with_counts_karma = best_per_entry_karma.merge(entry_counts_karma, on='Entry Name Base', how='left')",
|
||||
"",
|
||||
"print(f\"karma: {len(filtered_karma)} 条,Entry Name 去重后: {len(dedup_with_counts_karma)}\")",
|
||||
"entry_counts_karma.head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# vina: 按 Entry Name 统计重复次数并降重 (保留最优 docking score)",
|
||||
"entry_counts_vina = (",
|
||||
" filtered_vina.groupby('Entry Name Base')",
|
||||
" .size()",
|
||||
" .rename('count')",
|
||||
" .reset_index()",
|
||||
" .sort_values('count', ascending=False)",
|
||||
")",
|
||||
"best_per_entry_vina = (",
|
||||
" filtered_vina.sort_values('docking score', ascending=True)",
|
||||
" .groupby('Entry Name Base', as_index=False)",
|
||||
" .first()",
|
||||
")",
|
||||
"dedup_with_counts_vina = best_per_entry_vina.merge(entry_counts_vina, on='Entry Name Base', how='left')",
|
||||
"",
|
||||
"print(f\"vina : {len(filtered_vina)} 条,Entry Name 去重后: {len(dedup_with_counts_vina)}\")",
|
||||
"entry_counts_vina.head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# karma 与 vina 的 Entry Name 交集 (并展示各自 count)",
|
||||
"karma_names = set(entry_counts_karma['Entry Name Base'])",
|
||||
"vina_names = set(entry_counts_vina['Entry Name Base'])",
|
||||
"common_names = sorted(karma_names & vina_names)",
|
||||
"",
|
||||
"common_df = (",
|
||||
" pd.DataFrame({'Entry Name Base': common_names})",
|
||||
" .merge(entry_counts_karma.rename(columns={'count': 'karma_count'}), on='Entry Name Base', how='left')",
|
||||
" .merge(entry_counts_vina.rename(columns={'count': 'vina_count'}), on='Entry Name Base', how='left')",
|
||||
" .sort_values(['karma_count', 'vina_count', 'Entry Name Base'], ascending=[False, False, True])",
|
||||
")",
|
||||
"print(f\"交集个数: {len(common_df)}\")",
|
||||
"common_df.head(20)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "vina",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -1,23 +0,0 @@
|
||||
# 合并 COCOUNT 与 plant 的对接结果。
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
|
||||
# Combine the "plant" and "cocount" CSV tables for each target (trpe, fgbar).
# Every row is tagged with the table it came from in a `source` column, and
# one combined CSV is written per target.
data = dict(
    plant=["qed_values_poses_trpe_converted.csv", "qed_values_poses_fgbar_converted.csv"],
    cocount=["qed_values_trpe.csv", "qed_values_fgbar.csv"],
)


def _read_tagged(csv_name, origin):
    """Load *csv_name* with pandas and set its ``source`` column to *origin*."""
    frame = pd.read_csv(csv_name)
    frame["source"] = origin
    return frame


# In each list, index 0 is the trpe file and index 1 is the fgbar file.
df_trpe_plant1 = _read_tagged(data["plant"][0], "plant")
df_fgbar_plant2 = _read_tagged(data["plant"][1], "plant")
df_trpe_cocount1 = _read_tagged(data["cocount"][0], "cocount")
df_fgbar_cocount2 = _read_tagged(data["cocount"][1], "cocount")

# Stack the plant rows on top of the cocount rows for each target.
df_trpe = pd.concat([df_trpe_plant1, df_trpe_cocount1])
df_fgbar = pd.concat([df_fgbar_plant2, df_fgbar_cocount2])

# The row index is not written to the output files.
for merged, out_name in (
    (df_trpe, "qed_values_poses_trpe_all.csv"),
    (df_fgbar, "qed_values_poses_fgbar_all.csv"),  # 41166 rows (per the original note)
):
    merged.to_csv(out_name, index=False)
|
||||
Reference in New Issue
Block a user