Files
vinatools/scripts/analyze_results.py
2025-10-15 20:14:12 +08:00

152 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
分析AutoDock Vina对接结果脚本
用法: python analyze_results.py poses_directory output_csv
"""
import os
import sys
import pandas as pd
import re
from pathlib import Path
def parse_vina_output(pdbqt_file):
"""解析Vina输出文件提取能量信息"""
results = []
try:
with open(pdbqt_file, 'r') as f:
lines = f.readlines()
ligand_name = Path(pdbqt_file).stem.replace('_out', '')
model_num = 0
for line in lines:
if line.startswith('REMARK VINA RESULT:'):
# 解析能量信息
# 格式: REMARK VINA RESULT: -8.5 0.000 0.000
parts = line.strip().split()
if len(parts) >= 4:
binding_energy = float(parts[3])
rmsd_lb = float(parts[4]) if len(parts) > 4 else 0.0
rmsd_ub = float(parts[5]) if len(parts) > 5 else 0.0
model_num += 1
results.append({
'ligand_name': ligand_name,
'model': model_num,
'binding_energy': binding_energy,
'rmsd_lb': rmsd_lb,
'rmsd_ub': rmsd_ub,
'file_path': pdbqt_file
})
except Exception as e:
print(f"解析文件失败 {pdbqt_file}: {e}")
return results
def analyze_poses_directory(poses_dir):
"""分析整个poses目录"""
all_results = []
poses_path = Path(poses_dir)
if not poses_path.exists():
print(f"错误: 目录不存在 {poses_dir}")
return []
# 查找所有_out.pdbqt文件
pdbqt_files = list(poses_path.glob("*_out.pdbqt"))
print(f"找到 {len(pdbqt_files)} 个结果文件")
for pdbqt_file in pdbqt_files:
results = parse_vina_output(str(pdbqt_file))
all_results.extend(results)
return all_results
def generate_summary_stats(df):
"""生成汇总统计信息"""
if df.empty:
return {}
# 获取每个配体的最佳结合能
best_poses = df.loc[df.groupby('ligand_name')['binding_energy'].idxmin()]
stats = {
'total_ligands': df['ligand_name'].nunique(),
'total_poses': len(df),
'best_binding_energy': df['binding_energy'].min(),
'worst_binding_energy': df['binding_energy'].max(),
'mean_binding_energy': df['binding_energy'].mean(),
'median_binding_energy': df['binding_energy'].median(),
'std_binding_energy': df['binding_energy'].std(),
'ligands_better_than_minus_7': (best_poses['binding_energy'] < -7.0).sum(),
'ligands_better_than_minus_8': (best_poses['binding_energy'] < -8.0).sum(),
'ligands_better_than_minus_9': (best_poses['binding_energy'] < -9.0).sum(),
}
return stats
def main():
if len(sys.argv) != 3:
print("用法: python analyze_results.py <poses目录> <输出CSV文件>")
sys.exit(1)
poses_dir = sys.argv[1]
output_csv = sys.argv[2]
print("开始分析对接结果...")
# 分析所有结果
results = analyze_poses_directory(poses_dir)
if not results:
print("没有找到有效的结果文件")
sys.exit(1)
# 转换为DataFrame
df = pd.DataFrame(results)
# 保存详细结果
df.to_csv(output_csv, index=False)
print(f"详细结果已保存到: {output_csv}")
# 生成汇总统计
stats = generate_summary_stats(df)
# 打印汇总信息
print("\n=== 对接结果汇总 ===")
print(f"总配体数量: {stats['total_ligands']}")
print(f"总构象数量: {stats['total_poses']}")
print(f"最佳结合能: {stats['best_binding_energy']:.2f} kcal/mol")
print(f"最差结合能: {stats['worst_binding_energy']:.2f} kcal/mol")
print(f"平均结合能: {stats['mean_binding_energy']:.2f} kcal/mol")
print(f"中位结合能: {stats['median_binding_energy']:.2f} kcal/mol")
print(f"标准差: {stats['std_binding_energy']:.2f} kcal/mol")
print(f"结合能 < -7.0 的配体: {stats['ligands_better_than_minus_7']}")
print(f"结合能 < -8.0 的配体: {stats['ligands_better_than_minus_8']}")
print(f"结合能 < -9.0 的配体: {stats['ligands_better_than_minus_9']}")
# 显示前10个最佳结果
best_results = df.loc[df.groupby('ligand_name')['binding_energy'].idxmin()]
best_results = best_results.sort_values('binding_energy').head(10)
print("\n=== 前10个最佳结果 ===")
for _, row in best_results.iterrows():
print(f"{row['ligand_name']}: {row['binding_energy']:.2f} kcal/mol")
# 保存汇总统计
summary_file = output_csv.replace('.csv', '_summary.txt')
with open(summary_file, 'w') as f:
f.write("AutoDock Vina 对接结果汇总\n")
f.write("=" * 30 + "\n\n")
for key, value in stats.items():
f.write(f"{key}: {value}\n")
print(f"\n汇总统计已保存到: {summary_file}")
if __name__ == "__main__":
main()