#!/usr/bin/env python3 """ 分析AutoDock Vina对接结果脚本 用法: python analyze_results.py poses_directory output_csv """ import os import sys import pandas as pd import re from pathlib import Path def parse_vina_output(pdbqt_file): """解析Vina输出文件,提取能量信息""" results = [] try: with open(pdbqt_file, 'r') as f: lines = f.readlines() ligand_name = Path(pdbqt_file).stem.replace('_out', '') model_num = 0 for line in lines: if line.startswith('REMARK VINA RESULT:'): # 解析能量信息 # 格式: REMARK VINA RESULT: -8.5 0.000 0.000 parts = line.strip().split() if len(parts) >= 4: binding_energy = float(parts[3]) rmsd_lb = float(parts[4]) if len(parts) > 4 else 0.0 rmsd_ub = float(parts[5]) if len(parts) > 5 else 0.0 model_num += 1 results.append({ 'ligand_name': ligand_name, 'model': model_num, 'binding_energy': binding_energy, 'rmsd_lb': rmsd_lb, 'rmsd_ub': rmsd_ub, 'file_path': pdbqt_file }) except Exception as e: print(f"解析文件失败 {pdbqt_file}: {e}") return results def analyze_poses_directory(poses_dir): """分析整个poses目录""" all_results = [] poses_path = Path(poses_dir) if not poses_path.exists(): print(f"错误: 目录不存在 {poses_dir}") return [] # 查找所有_out.pdbqt文件 pdbqt_files = list(poses_path.glob("*_out.pdbqt")) print(f"找到 {len(pdbqt_files)} 个结果文件") for pdbqt_file in pdbqt_files: results = parse_vina_output(str(pdbqt_file)) all_results.extend(results) return all_results def generate_summary_stats(df): """生成汇总统计信息""" if df.empty: return {} # 获取每个配体的最佳结合能 best_poses = df.loc[df.groupby('ligand_name')['binding_energy'].idxmin()] stats = { 'total_ligands': df['ligand_name'].nunique(), 'total_poses': len(df), 'best_binding_energy': df['binding_energy'].min(), 'worst_binding_energy': df['binding_energy'].max(), 'mean_binding_energy': df['binding_energy'].mean(), 'median_binding_energy': df['binding_energy'].median(), 'std_binding_energy': df['binding_energy'].std(), 'ligands_better_than_minus_7': (best_poses['binding_energy'] < -7.0).sum(), 'ligands_better_than_minus_8': (best_poses['binding_energy'] < -8.0).sum(), 'ligands_better_than_minus_9': (best_poses['binding_energy'] < -9.0).sum(), } return stats def main(): if len(sys.argv) != 3: print("用法: python analyze_results.py <输出CSV文件>") sys.exit(1) poses_dir = sys.argv[1] output_csv = sys.argv[2] print("开始分析对接结果...") # 分析所有结果 results = analyze_poses_directory(poses_dir) if not results: print("没有找到有效的结果文件") sys.exit(1) # 转换为DataFrame df = pd.DataFrame(results) # 保存详细结果 df.to_csv(output_csv, index=False) print(f"详细结果已保存到: {output_csv}") # 生成汇总统计 stats = generate_summary_stats(df) # 打印汇总信息 print("\n=== 对接结果汇总 ===") print(f"总配体数量: {stats['total_ligands']}") print(f"总构象数量: {stats['total_poses']}") print(f"最佳结合能: {stats['best_binding_energy']:.2f} kcal/mol") print(f"最差结合能: {stats['worst_binding_energy']:.2f} kcal/mol") print(f"平均结合能: {stats['mean_binding_energy']:.2f} kcal/mol") print(f"中位结合能: {stats['median_binding_energy']:.2f} kcal/mol") print(f"标准差: {stats['std_binding_energy']:.2f} kcal/mol") print(f"结合能 < -7.0 的配体: {stats['ligands_better_than_minus_7']}") print(f"结合能 < -8.0 的配体: {stats['ligands_better_than_minus_8']}") print(f"结合能 < -9.0 的配体: {stats['ligands_better_than_minus_9']}") # 显示前10个最佳结果 best_results = df.loc[df.groupby('ligand_name')['binding_energy'].idxmin()] best_results = best_results.sort_values('binding_energy').head(10) print("\n=== 前10个最佳结果 ===") for _, row in best_results.iterrows(): print(f"{row['ligand_name']}: {row['binding_energy']:.2f} kcal/mol") # 保存汇总统计 summary_file = output_csv.replace('.csv', '_summary.txt') with open(summary_file, 'w') as f: f.write("AutoDock Vina 对接结果汇总\n") f.write("=" * 30 + "\n\n") for key, value in stats.items(): f.write(f"{key}: {value}\n") print(f"\n汇总统计已保存到: {summary_file}") if __name__ == "__main__": main()