"""Intersect clustering results with docking score tables.

For each target, the clustering CSV (one row per representative pose file)
is inner-joined with a score CSV on the structure identifier, and the
matching rows are written to a new CSV.
"""

import os
from pathlib import Path
from typing import Union

import pandas as pd

_PathArg = Union[str, os.PathLike]

# Default data root used when the script is run directly.
DEFAULT_BASE_DIR = "/Users/lingyuzeng/Downloads/211.69.141.180/202508021824/vina"


def process_cluster_file(
    cluster_file: _PathArg,
    score_file: _PathArg,
    output_file: _PathArg,
) -> int:
    """Inner-join a cluster CSV with a score CSV and save the result.

    The join key on the cluster side is derived from the ``filename`` column:
    the file stem, truncated at the first ``_out`` (so ``/a/b/1abc_out_3.pdbqt``
    becomes ``1abc``). It is matched against the ``pdb_id`` column of the
    score file.

    Args:
        cluster_file: CSV containing a ``filename`` column.
        score_file: CSV containing a ``pdb_id`` column.
        output_file: Destination CSV; parent directories are created if
            they do not exist.

    Returns:
        Number of rows in the intersection.

    Raises:
        FileNotFoundError: If either input file does not exist.
        KeyError: If a required column is missing from an input file.
    """
    # Fail early with a clear message if an input file is missing.
    if not os.path.exists(cluster_file):
        raise FileNotFoundError(f"聚类文件不存在: {cluster_file}")
    if not os.path.exists(score_file):
        raise FileNotFoundError(f"评分文件不存在: {score_file}")

    # Read the key columns as text. With dtype inference, numeric-looking IDs
    # (e.g. "007") would be parsed as integers, which loses leading zeros and
    # makes the merge fail on mismatched key dtypes.
    cluster_df = pd.read_csv(cluster_file, dtype={'filename': str})
    if 'filename' not in cluster_df.columns:
        raise KeyError(f"聚类文件缺少 'filename' 列: {cluster_file}")

    score_df = pd.read_csv(score_file, dtype={'pdb_id': str})
    if 'pdb_id' not in score_df.columns:
        raise KeyError(f"评分文件缺少 'pdb_id' 列: {score_file}")

    # Derive the join key: file stem, cut at the first "_out".
    cluster_df['filename_stem'] = cluster_df['filename'].apply(
        lambda x: Path(x).stem.split('_out')[0]
    )

    # Keep only rows present in both files.
    intersection = pd.merge(
        cluster_df,
        score_df,
        left_on='filename_stem',
        right_on='pdb_id',
        how='inner',
    )

    # Make sure the destination directory exists before writing.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    intersection.to_csv(output_file, index=False)

    return len(intersection)


def main(base_dir: _PathArg = DEFAULT_BASE_DIR) -> None:
    """Run the intersection for the fgbar and trpe data sets under *base_dir*."""
    # An absolute base path keeps the file locations independent of the
    # current working directory.
    root = Path(base_dir)
    cluster_dir = root / "scripts" / "finally_data" / "cluster_best"
    score_dir = root / "result" / "karamadock"

    # fgbar data set
    fgbar_count = process_cluster_file(
        cluster_dir / "fgbar_cluster_best_vina_butina_butina.csv",
        score_dir / "FgBar1_score.csv",
        cluster_dir / "fgbar_intersection.csv",
    )

    # trpe data set
    trpe_count = process_cluster_file(
        cluster_dir / "trpe_cluster_best_vina_butina_butina.csv",
        score_dir / "TrpE_score.csv",
        cluster_dir / "trpe_intersection.csv",
    )

    print(f"fgbar交集数量: {fgbar_count}")
    print(f"trpe交集数量: {trpe_count}")

    print("脚本执行完成")


if __name__ == "__main__":
    main()