Clustering: after clustering, pick the highest-scoring molecule from each cluster, then intersect with the karamadock results

2025-08-18 15:50:12 +08:00
parent b85b02b5c3
commit 7ee5b18b99
12 changed files with 18046 additions and 4 deletions


@@ -8,6 +8,40 @@ python scripts/cluster_granularity_scan.py \
--smiles-col smiles \
--radius 3 \
--n-bits 1024
Method        Params              #Clusters  AvgSize  AvgIntraSim
Butina        {'cutoff': 0.4}          8960     2.10        0.706
Butina        {'cutoff': 0.5}          6720     2.80        0.625
Butina        {'cutoff': 0.6}          4648     4.04        0.548
Butina        {'cutoff': 0.7}          2783     6.75        0.463
Butina        {'cutoff': 0.8}           958    19.61        0.333
Hierarchical  {'threshold': 0.3}      12235     1.54        0.814
Hierarchical  {'threshold': 0.4}       9603     1.96        0.739
Hierarchical  {'threshold': 0.5}       7300     2.57        0.664
DBSCAN        {'eps': 0.2}             2050     3.18        0.106
DBSCAN        {'eps': 0.3}             2275     4.61        0.113
DBSCAN        {'eps': 0.4}             2014     6.65        0.113
KMeans        {'k': 10}                  10  1878.70        0.204
KMeans        {'k': 20}                  20   939.35        0.200
KMeans        {'k': 50}                  50   375.74        0.233
| Column | Meaning |
| --------------- | --------------------------------- |
| **#Clusters** | Number of clusters produced (count of distinct clusters) |
| **AvgSize** | Average number of molecules per cluster = total samples / number of clusters |
| **AvgIntraSim** | Average pairwise similarity between molecules within a cluster (closer to 1 means the cluster is more internally similar) |
Current data (the scan covers roughly 18.8k molecules, per #Clusters × AvgSize, e.g. KMeans k=10: 10 × 1878.70 ≈ 18,787):
Butina at cutoff=0.4 reaches AvgIntraSim=0.706 (cluster members are reasonably similar, but the cluster count is very high).
Hierarchical at threshold 0.3 reaches AvgIntraSim=0.814 (tighter clusters, but even more of them).
DBSCAN and KMeans both show low intra-cluster similarity, suggesting they are poor fits for this task in Tanimoto space.
Clustering: use Butina with cutoff ≈ 0.6-0.7 or Hierarchical with threshold ≈ 0.5-0.6 (intra-cluster variation stays manageable without producing too many clusters).
Representative selection: take one centroid molecule per cluster (the member with the highest average similarity to the other members); see the sketch after this hunk.
If more diversity is still wanted, run a MaxMin picking pass over the representative set.
"""
import sys, os
from pathlib import Path
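To make the recommendation above concrete, here is a minimal sketch of the cluster-then-pick-centroid step, assuming RDKit is available; the function name butina_representatives and the default cutoff are illustrative, not part of this commit:

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.ML.Cluster import Butina

def butina_representatives(smiles_list, cutoff=0.6, radius=3, n_bits=1024):
    """Butina-cluster Morgan fingerprints; return one centroid index per cluster."""
    # Assumes all SMILES parse; filter out None mols in real use
    mols = [Chem.MolFromSmiles(s) for s in smiles_list]
    fps = [AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=n_bits) for m in mols]
    # Butina takes a condensed distance matrix (1 - Tanimoto similarity)
    dists = []
    for i in range(1, len(fps)):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend(1.0 - s for s in sims)
    clusters = Butina.ClusterData(dists, len(fps), cutoff, isDistData=True)
    reps = []
    for cluster in clusters:
        if len(cluster) == 1:
            reps.append(cluster[0])
            continue
        # Centroid = member with the highest mean similarity to the rest of its cluster
        best_idx, best_sim = cluster[0], -1.0
        for i in cluster:
            others = [fps[j] for j in cluster if j != i]
            mean_sim = sum(DataStructs.BulkTanimotoSimilarity(fps[i], others)) / len(others)
            if mean_sim > best_sim:
                best_idx, best_sim = i, mean_sim
        reps.append(best_idx)
    return reps  # indices into smiles_list

For the optional diversity pass mentioned above, RDKit's MaxMinPicker (from rdkit.SimDivFilters.rdSimDivPickers) can be run over the representatives' fingerprints.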


@@ -11,20 +11,20 @@ print("Running analysis examples...")
# Example 1: Basic usage
print("\nExample 1: Basic usage")
-main_api(['qed_values_fgbar.csv', 'qed_values_trpe.csv'], ['fgbar', 'trpe'])
+main_api(['finally_data/qed_values_poses_fgbar_all.csv', 'finally_data/qed_values_poses_trpe_all.csv'], ['fgbar', 'trpe'])
# Example 2: With custom reference scores
print("\nExample 2: With custom reference scores")
-main_api(['qed_values_fgbar.csv', 'qed_values_trpe.csv'], ['fgbar', 'trpe'],
+main_api(['finally_data/qed_values_poses_fgbar_all.csv', 'finally_data/qed_values_poses_trpe_all.csv'], ['fgbar', 'trpe'],
reference_scores={'fgbar': {'9NY': -5.268}, 'trpe': {'0GA': -6.531}})
# Example 3: With specific conformation rank
print("\nExample 3: With specific conformation rank")
-main_api(['qed_values_fgbar.csv', 'qed_values_trpe.csv'], ['fgbar', 'trpe'], rank=0)
+main_api(['finally_data/qed_values_poses_fgbar_all.csv', 'finally_data/qed_values_poses_trpe_all.csv'], ['fgbar', 'trpe'], rank=0)
# Example 4: With both custom reference scores and specific conformation rank
print("\nExample 4: With both custom reference scores and specific conformation rank")
-main_api(['qed_values_fgbar.csv', 'qed_values_trpe.csv'], ['fgbar', 'trpe'],
+main_api(['finally_data/qed_values_poses_fgbar_all.csv', 'finally_data/qed_values_poses_trpe_all.csv'], ['fgbar', 'trpe'],
reference_scores={'fgbar': {'9NY': -5.268}, 'trpe': {'0GA': -6.531}}, rank=0)
print("\nAnalysis complete! Check the generated PNG files.")


@@ -0,0 +1,59 @@
import pandas as pd
from pathlib import Path
import os

def process_cluster_file(cluster_file, score_file, output_file):
    # Check that both input files exist
    if not os.path.exists(cluster_file):
        raise FileNotFoundError(f"Cluster file not found: {cluster_file}")
    if not os.path.exists(score_file):
        raise FileNotFoundError(f"Score file not found: {score_file}")
    # Read the clustering results
    cluster_df = pd.read_csv(cluster_file)
    # Normalize the filename column to a bare ligand ID (stem, truncated at '_out')
    cluster_df['filename_stem'] = cluster_df['filename'].apply(
        lambda x: Path(x).stem.split('_out')[0]
    )
    # Read the score file
    score_df = pd.read_csv(score_file)
    # Intersect the cluster table with the karamadock score table on ligand ID
    intersection = pd.merge(
        cluster_df,
        score_df,
        left_on='filename_stem',
        right_on='pdb_id',
        how='inner'
    )
    # Save the result
    intersection.to_csv(output_file, index=False)
    return len(intersection)

if __name__ == "__main__":
    # Use absolute paths so the file locations are unambiguous
    base_dir = "/Users/lingyuzeng/Downloads/211.69.141.180/202508021824/vina"
    # Process the fgbar data
    fgbar_count = process_cluster_file(
        f"{base_dir}/scripts/finally_data/cluster_best/fgbar_cluster_best_vina_butina_butina.csv",
        f"{base_dir}/result/karamadock/FgBar1_score.csv",
        f"{base_dir}/scripts/finally_data/cluster_best/fgbar_intersection.csv"
    )
    # Process the trpe data
    trpe_count = process_cluster_file(
        f"{base_dir}/scripts/finally_data/cluster_best/trpe_cluster_best_vina_butina_butina.csv",
        f"{base_dir}/result/karamadock/TrpE_score.csv",
        f"{base_dir}/scripts/finally_data/cluster_best/trpe_intersection.csv"
    )
    print(f"fgbar intersection size: {fgbar_count}")
    print(f"trpe intersection size: {trpe_count}")
    # Confirm the output files were generated
    print("Script finished")


@@ -0,0 +1,64 @@
import pandas as pd
import os
import ast
import argparse
def parse_vina_scores(vina_scores_str):
"""解析vina_scores字符串为浮点数列表"""
try:
scores = ast.literal_eval(vina_scores_str)
if isinstance(scores, list) and len(scores) > 0:
return scores[0] # 取第一个值作为vina_score
return None
except:
return None
def extract_top_molecules(file_path, output_dir, dataset_name):
"""从CSV文件中提取karma_score_aligned和vina_score前1000的分子"""
# 读取数据
df = pd.read_csv(file_path)
# 解析vina_scores列
df['vina_score'] = df['vina_scores'].apply(parse_vina_scores)
# 按karma_score_aligned排序并提取前1000
df_karma_top = df.sort_values('karma_score_aligned', ascending=False).head(1000)
# 按vina_score排序并提取前1000
df_vina_top = df.sort_values('vina_score', ascending=False).head(1000)
# 保存结果
karma_output_file = os.path.join(output_dir, f"{dataset_name}_karma_score_aligned_top1000.csv")
vina_output_file = os.path.join(output_dir, f"{dataset_name}_vina_score_top1000.csv")
df_karma_top.to_csv(karma_output_file, index=False)
df_vina_top.to_csv(vina_output_file, index=False)
print(f"{dataset_name} - karma_score_aligned前1000分子保存到: {karma_output_file}")
print(f"{dataset_name} - vina_score前1000分子保存到: {vina_output_file}")
print(f"{dataset_name} - karma_score_aligned前1000分子数量: {len(df_karma_top)}")
print(f"{dataset_name} - vina_score前1000分子数量: {len(df_vina_top)}")
return df_karma_top, df_vina_top
def main():
parser = argparse.ArgumentParser(description='从CSV文件中提取karma_score_aligned和vina_score前1000的分子')
parser.add_argument('--input', nargs='+', required=True,
help='输入CSV文件路径列表')
parser.add_argument('--dataset-names', nargs='+', required=True,
help='数据集名称列表,与输入文件一一对应')
parser.add_argument('--output', required=True,
help='输出目录')
args = parser.parse_args()
# 确保输出目录存在
os.makedirs(args.output, exist_ok=True)
# 处理每个文件
for file_path, dataset_name in zip(args.input, args.dataset_names):
print(f"Processing {dataset_name}...")
extract_top_molecules(file_path, args.output, dataset_name)
if __name__ == "__main__":
main()
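A hypothetical invocation, using the input CSVs referenced elsewhere in this commit (the script filename and output directory are illustrative):

python extract_top1000.py \
    --input finally_data/qed_values_poses_fgbar_all.csv finally_data/qed_values_poses_trpe_all.csv \
    --dataset-names fgbar trpe \
    --output finally_data/top1000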

File diff suppressed because it is too large

File diff suppressed because it is too large