From aef322a86da356bdcd9321a724bc1838c2d7cecf Mon Sep 17 00:00:00 2001 From: lingyuzeng Date: Tue, 5 Aug 2025 20:37:33 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E7=A7=BB=E5=8A=A8=E4=BD=8D?= =?UTF-8?q?=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/analyze_qed_mw_distribution.py | 15 +++----- scripts/calculate_qed_values.py | 9 +---- scripts/example_api_usage.py | 5 --- scripts/filter_qed_trpe.py | 53 ++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 22 deletions(-) create mode 100644 scripts/filter_qed_trpe.py diff --git a/scripts/analyze_qed_mw_distribution.py b/scripts/analyze_qed_mw_distribution.py index 56ddd73..1916f04 100644 --- a/scripts/analyze_qed_mw_distribution.py +++ b/scripts/analyze_qed_mw_distribution.py @@ -17,11 +17,6 @@ import logging import ast import json import click -import sys -import os - -# Add the parent directory to the path to import modules -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Setup logging logging.basicConfig(level=logging.INFO) @@ -125,8 +120,8 @@ def load_vina_scores_from_csv(df, max_files=1000): if processed_files >= max_files: break - # Skip reference molecules (those with align_ and _out_converted.sdf in their filename) - if 'align_' in row['filename'] and '_out_converted.sdf' in row['filename']: + # Skip reference molecules (those with mol2 extension) + if '.mol2' in row['filename']: continue try: @@ -179,8 +174,8 @@ def get_reference_vina_scores(dataset_name, rank=0): """ reference_scores = {} - # 使用更新后的路径以适应新目录结构 - reference_dir = Path("../result") / "refence" / dataset_name + # 使用原始目录名称 "refence" + reference_dir = Path("result") / "refence" / dataset_name if not reference_dir.exists(): logger.warning(f"Reference directory {reference_dir} does not exist") @@ -207,7 +202,7 @@ def get_reference_vina_scores(dataset_name, rank=0): if '_addH' in filename_stem: filename_stem = 
filename_stem.replace('_addH', '') if 'align_' in filename_stem: - filename_stem = filename_stem.split('_')[-1] + filename_stem = filename_stem.split('_')[-1] # Get the last part (e.g., 9NY or 0GA) # Use filename_stem as key for reference_scores reference_scores[filename_stem] = reference_score diff --git a/scripts/calculate_qed_values.py b/scripts/calculate_qed_values.py index 8d37241..8c39b5a 100644 --- a/scripts/calculate_qed_values.py +++ b/scripts/calculate_qed_values.py @@ -14,11 +14,6 @@ from rdkit.Chem.Descriptors import MolWt from pathlib import Path import logging import json -import sys -import os - -# Add the parent directory to the path to import modules -sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Setup logging logging.basicConfig(level=logging.INFO) @@ -133,7 +128,7 @@ def calculate_qed_for_poses_all(base_dir, dataset_name): 'filename': sdf_file.name, 'qed': qed_value, 'molecular_weight': mol_weight, - 'vina_scores': str(vina_scores) # 添加Vina得分列表 + 'vina_scores': vina_scores # 添加Vina得分列表 }) except Exception as e: logger.warning(f"Failed to calculate QED for {sdf_file}: {e}") @@ -222,7 +217,7 @@ def main(): Main function to calculate QED values for all molecules """ # Define base directories - result_dir = Path("../result") + result_dir = Path("result") # Process both datasets (fgbar and trpe) separately datasets = ["fgbar", "trpe"] diff --git a/scripts/example_api_usage.py b/scripts/example_api_usage.py index db5f265..f46b3ed 100644 --- a/scripts/example_api_usage.py +++ b/scripts/example_api_usage.py @@ -5,11 +5,6 @@ Example usage of the analyze_qed_mw_distribution API """ -import sys -import os -# Add the scripts directory to the path -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - from analyze_qed_mw_distribution import main_api print("Running analysis examples...") diff --git a/scripts/filter_qed_trpe.py b/scripts/filter_qed_trpe.py new file mode 100644 index 0000000..df31599 --- /dev/null +++ 
#!/usr/bin/env python3
"""Filter TRPE molecule data.

Keep molecules whose molecular weight is < 800 and whose best (most negative)
Vina score is < -6.5, select the top 100 of those by QED value, and write them
out ordered by best Vina score, ascending (strongest predicted binders first).
"""

import ast
import math
import os
import sys

import pandas as pd


def _parse_vina_scores(raw):
    """Return the list of Vina scores stored in one CSV cell, or [] if unusable."""
    if isinstance(raw, (list, tuple)):
        return list(raw)
    if not isinstance(raw, str):
        # pandas hands back float NaN for an empty cell; there is nothing to parse.
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return []
    return list(parsed) if isinstance(parsed, (list, tuple)) else []


def _best_vina_score(scores):
    """Return the minimum numeric score in *scores*, or NaN when there is none."""
    numeric = [
        score
        for score in scores
        if isinstance(score, (int, float))
        and not isinstance(score, bool)
        and not math.isnan(score)
    ]
    return min(numeric) if numeric else float("nan")


def filter_trpe_data(input_file, output_file, top_n=100, *,
                     max_molecular_weight=800, max_vina_score=-6.5):
    """Filter TRPE molecules and write the selected rows to a CSV file.

    The input CSV must provide the columns ``molecular_weight``, ``qed`` and
    ``vina_scores``; each ``vina_scores`` cell holds the textual form of a
    Python list of floats. Rows whose ``vina_scores`` cell is empty,
    unparsable, or an empty list get a NaN ``min_vina_score`` and are
    therefore excluded by the filter instead of aborting the whole run.

    :param input_file: path of the input CSV file
    :param output_file: path of the CSV file to write
    :param top_n: number of molecules to keep, ranked by QED (descending)
    :param max_molecular_weight: exclusive upper bound on molecular weight
    :param max_vina_score: exclusive upper bound on the best Vina score
    :return: None; the result is written to ``output_file`` and a summary is
        printed to stdout
    """
    df = pd.read_csv(input_file)

    # Turn the textual score lists back into real lists.
    df['vina_scores'] = df['vina_scores'].apply(_parse_vina_scores)

    # Best score per molecule: Vina scores are negative, so the minimum is the
    # most favourable one. The cast keeps the column numeric even when empty.
    df['min_vina_score'] = df['vina_scores'].apply(_best_vina_score).astype(float)

    # Comparisons against NaN are False, so rows without usable scores drop out.
    filtered_df = df[
        (df['molecular_weight'] < max_molecular_weight)
        & (df['min_vina_score'] < max_vina_score)
    ]

    # Rank by QED (descending) and keep the first top_n rows. A stable sort
    # makes the selection deterministic when QED values tie at the cut-off.
    top_qed_df = filtered_df.sort_values(
        'qed', ascending=False, kind='mergesort'
    ).head(top_n)

    # Present the selection ordered by best Vina score, ascending.
    final_df = top_qed_df.sort_values(
        'min_vina_score', ascending=True, kind='mergesort'
    )

    final_df.to_csv(output_file, index=False)

    print("过滤完成:")
    print(f"  原始数据: {len(df)} 条记录")
    print(f"  分子量<{max_molecular_weight}且Vina得分<{max_vina_score}: {len(filtered_df)} 条记录")
    print(f"  按QED排名前{top_n}并按Vina得分排序: {len(final_df)} 条记录")
    print(f"  输出文件: {output_file}")


if __name__ == "__main__":
    # Input and output live next to this script, regardless of the current
    # working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    input_csv = os.path.join(script_dir, "qed_values_trpe.csv")
    output_csv = os.path.join(script_dir, "filtered_qed_trpe_top100.csv")

    filter_trpe_data(input_csv, output_csv, top_n=100)