This commit is contained in:
2025-10-15 20:14:12 +08:00
parent f4afbb7712
commit 7ac0e58599
16 changed files with 6085 additions and 0 deletions

59
.dockerignore Normal file
View File

@@ -0,0 +1,59 @@
# Git
.git
.gitignore
.gitattributes

# Docker
docker/
.dockerignore

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
venv/
ENV/
env.bak/
venv.bak/

# Data files
data/
results/
output/
*.pdb
*.sdf
*.mol2
*.xyz

# Log files
*.log
logs/

# Temporary files
*.tmp
*.temp

# OS files (previously listed twice, under both "temporary" and "system")
.DS_Store
Thumbs.db

# IDE files
.vscode/
.idea/
*.swp
*.swo

# Documentation
*.md
docs/

# Test files
test/
tests/
*_test.py
test_*.py

2
.gitattributes vendored Normal file
View File

@@ -0,0 +1,2 @@
# SCM syntax highlighting & preventing 3-way merges
pixi.lock merge=binary linguist-language=YAML linguist-generated=true

4
.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
# pixi environments
.pixi/*
!.pixi/config.toml
bin/

141
README.md Normal file
View File

@@ -0,0 +1,141 @@
## 环境管理 (使用 pixi)
### 安装 pixi
```bash
# 安装 pixi
curl -fsSL https://pixi.sh/install.sh | bash
# 重新加载 shell 配置
source ~/.bashrc # 或 source ~/.zshrc
```
### 初始化项目环境
```bash
# 在项目目录中初始化 pixi 环境
pixi init
# 添加所需的包
pixi add rdkit openbabel meeko
# 激活环境
pixi shell
```
### 使用环境
```bash
# 激活 pixi 环境
pixi shell
# 在环境中运行脚本
pixi run python scripts/your_script.py
# 或者直接使用 pixi 执行命令
pixi run vina --help
```
## AutoDock Vina 安装
### 下载二进制文件
[Download](https://github.com/ccsb-scripps/AutoDock-Vina/releases/tag/v1.2.7)
for macos:
```bash
wget -O ./bin/vina_1.2.7_mac_aarch64 https://github.com/ccsb-scripps/AutoDock-Vina/releases/download/v1.2.7/vina_1.2.7_mac_aarch64
wget -O ./bin/vina_split_1.2.7_mac_aarch64 https://github.com/ccsb-scripps/AutoDock-Vina/releases/download/v1.2.7/vina_split_1.2.7_mac_aarch64
# 或者使用 curl:
curl -L -o ./bin/vina_1.2.7_mac_aarch64 https://github.com/ccsb-scripps/AutoDock-Vina/releases/download/v1.2.7/vina_1.2.7_mac_aarch64
curl -L -o ./bin/vina_split_1.2.7_mac_aarch64 https://github.com/ccsb-scripps/AutoDock-Vina/releases/download/v1.2.7/vina_split_1.2.7_mac_aarch64
chmod +x ./bin/vina_*
```
## 项目使用
### 快速开始
```bash
# 1. 克隆项目
git clone <your-repo-url>
cd vinatools
# 2. 初始化 pixi 环境
pixi init
pixi add rdkit openbabel meeko
# 3. 下载 AutoDock Vina 二进制文件
mkdir -p bin
curl -L -o ./bin/vina_1.2.7_mac_aarch64 https://github.com/ccsb-scripps/AutoDock-Vina/releases/download/v1.2.7/vina_1.2.7_mac_aarch64
curl -L -o ./bin/vina_split_1.2.7_mac_aarch64 https://github.com/ccsb-scripps/AutoDock-Vina/releases/download/v1.2.7/vina_split_1.2.7_mac_aarch64
chmod +x ./bin/vina_*
# 4. 激活环境并运行
pixi shell
```
### 环境管理命令
```bash
# 查看已安装的包
pixi list
# 添加新包
pixi add package_name
# 移除包
pixi remove package_name
# 更新所有包
pixi update
# 导出环境配置
pixi export --format conda-lock
```
### 项目结构
```
vinatools/
├── bin/ # AutoDock Vina 二进制文件
│ ├── vina_1.2.7_mac_aarch64
│ └── vina_split_1.2.7_mac_aarch64
├── scripts/ # Python 脚本
│ ├── batch_prepare_ligands.sh
│ ├── batch_docking.sh
│ ├── calculate_qed_values.py
│ └── analyze_results.py
├── pixi.toml # pixi 环境配置文件
├── docker/ # Docker 配置文件
│ ├── Dockerfile
│ ├── docker-compose.yml
│ └── README.md
└── README.md
```
## Docker 环境
### 快速使用 Docker
```bash
# 构建并运行 Docker 容器
docker-compose -f docker/docker-compose.yml up -d
# 进入容器
docker-compose -f docker/docker-compose.yml exec vinatools bash
# 运行脚本
docker-compose -f docker/docker-compose.yml exec vinatools pixi run python scripts/calculate_qed_values.py
```
### Docker 服务说明
- **vinatools**: 主服务,包含 pixi 环境和所有依赖包
- **jupyter**: Jupyter Notebook 服务,访问 http://localhost:8888
详细使用说明请参考 [docker/README.md](docker/README.md)

6
config/example.txt Normal file
View File

@@ -0,0 +1,6 @@
center_x = -12.7
center_y = -9.1
center_z = -0.3
size_x = 49.1
size_y = 37.6
size_z = 35.2

69
docker/Dockerfile Normal file
View File

@@ -0,0 +1,69 @@
# Ubuntu 22.04 base image (pulled through the Tencent Cloud registry mirror)
FROM ccr.ccs.tencentyun.com/library/ubuntu:22.04

# Build-time only: keep debconf from prompting during apt installs.
# Use ARG instead of ENV so the setting does not leak into the runtime
# environment of containers built from this image.
ARG DEBIAN_FRONTEND=noninteractive
ENV PATH="/root/.pixi/bin:$PATH"

# Point APT at the Aliyun mirror to speed up package downloads.
RUN sed -i 's@//.*archive.ubuntu.com@//mirrors.aliyun.com@g' /etc/apt/sources.list && \
    sed -i 's@//.*security.ubuntu.com@//mirrors.aliyun.com@g' /etc/apt/sources.list

# System dependencies. update + install in a single layer so the APT metadata
# is never stale; drop recommended packages and the package lists to keep the
# layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    git \
    gnupg \
    lsb-release \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Configure the pip mirror (Tsinghua) for any pip usage inside the image.
# printf is used instead of chained `echo` calls: one command, one file write.
RUN mkdir -p /root/.pip && \
    printf '%s\n' \
        '[global]' \
        'index-url = https://pypi.tuna.tsinghua.edu.cn/simple' \
        'trusted-host = pypi.tuna.tsinghua.edu.cn' \
        > /root/.pip/pip.conf

# Install pixi (environment / package manager).
# NOTE(review): this is an unpinned `curl | bash`; consider pinning a pixi
# version and verifying a checksum for reproducible builds.
RUN curl -fsSL https://pixi.sh/install.sh | bash

# Project root inside the image.
WORKDIR /app

# Copy the project in (paths listed in .dockerignore are excluded).
COPY . .

# Directory for the Vina binaries downloaded below.
RUN mkdir -p bin

# Download the AutoDock Vina binaries (skippable via DOWNLOAD_VINA=false).
# NOTE(review): the default platform is mac_aarch64, which cannot execute
# inside this Linux image — docker-compose overrides it; pass a linux_*
# platform when building directly with `docker build`.
ARG VINA_VERSION=1.2.7
ARG VINA_PLATFORM=mac_aarch64
ARG DOWNLOAD_VINA=true
RUN if [ "$DOWNLOAD_VINA" = "true" ]; then \
    curl -L -o ./bin/vina_${VINA_VERSION}_${VINA_PLATFORM} https://github.com/ccsb-scripps/AutoDock-Vina/releases/download/v${VINA_VERSION}/vina_${VINA_VERSION}_${VINA_PLATFORM} && \
    curl -L -o ./bin/vina_split_${VINA_VERSION}_${VINA_PLATFORM} https://github.com/ccsb-scripps/AutoDock-Vina/releases/download/v${VINA_VERSION}/vina_split_${VINA_VERSION}_${VINA_PLATFORM} && \
    chmod +x ./bin/vina_*; \
    fi

# Register the container platform with pixi and install the conda packages.
RUN /root/.pixi/bin/pixi workspace platform add linux-aarch64 && \
    /root/.pixi/bin/pixi add rdkit openbabel meeko

# Make the pixi tools and the Vina binaries available on PATH at runtime.
ENV PATH="/root/.pixi/bin:/app/bin:$PATH"

# Entrypoint wrapper: load the shell profile, then `exec` the requested
# command so it becomes PID 1 and receives signals from `docker stop`.
# printf is used instead of `echo '...\n...'` because echo's handling of
# backslash escapes varies between shells.
RUN printf '#!/bin/bash\nsource /root/.bashrc\nexec "$@"\n' > /entrypoint.sh && \
    chmod +x /entrypoint.sh

ENTRYPOINT ["/entrypoint.sh"]

# Default to an interactive pixi shell.
CMD ["/root/.pixi/bin/pixi", "shell"]

201
docker/README.md Normal file
View File

@@ -0,0 +1,201 @@
# Docker 环境使用说明
## 快速开始
### 1. 环境变量配置
```bash
# 复制环境变量模板
cp docker/docker.env.example docker/.env
# 编辑环境变量
vim docker/.env
```
### 2. 构建镜像
```bash
# 使用默认配置构建
docker-compose -f docker/docker-compose.yml build
# 使用环境变量构建
docker-compose -f docker/docker-compose.yml --env-file docker/.env build
# 或者直接使用 docker build
docker build -f docker/Dockerfile -t vinatools:latest .
```
### 3. 环境变量说明
| 变量名 | 默认值 | 说明 |
|--------|--------|------|
| `VINA_VERSION` | `1.2.7` | AutoDock Vina 版本 |
| `VINA_PLATFORM` | `mac_aarch64` | 平台架构 |
| `DOWNLOAD_VINA` | `true` | 是否下载 AutoDock Vina |
**支持的平台:**
- `mac_aarch64` - Apple Silicon Mac
- `mac_x86_64` - Intel Mac
- `linux_x86_64` - Linux x86_64
- `windows_x86_64` - Windows x86_64
### 4. 运行容器
```bash
# 启动主服务
docker-compose -f docker/docker-compose.yml up -d vinatools
# 进入容器
docker-compose -f docker/docker-compose.yml exec vinatools bash
# 或者直接运行
docker run -it --rm -v $(pwd):/app vinatools:latest bash
```
## 使用示例
### 不同平台构建
```bash
# Linux x86_64 平台
VINA_PLATFORM=linux_x86_64 docker-compose -f docker/docker-compose.yml build
# Linux aarch64 平台
VINA_PLATFORM=linux_aarch64 docker-compose -f docker/docker-compose.yml build
# Intel Mac 平台
VINA_PLATFORM=mac_x86_64 docker-compose -f docker/docker-compose.yml build
# 不下载 AutoDock Vina
DOWNLOAD_VINA=false docker-compose -f docker/docker-compose.yml build
```
### 使用环境文件
```bash
# 创建自定义环境文件
cat > docker/my.env << EOF
VINA_VERSION=1.2.6
VINA_PLATFORM=linux_x86_64
DOWNLOAD_VINA=true
EOF
# 使用自定义环境文件构建
docker-compose -f docker/docker-compose.yml --env-file docker/my.env build
```
### 3. 使用 Jupyter Notebook
```bash
# 启动 Jupyter 服务
docker-compose -f docker/docker-compose.yml up -d jupyter
# 访问 http://localhost:8888
```
## 环境说明
### 镜像源配置
为了加速构建过程,Dockerfile 中配置了以下镜像源:
- **APT 源**: 阿里云镜像 (mirrors.aliyun.com)
- **pip 源**: 清华大学镜像 (pypi.tuna.tsinghua.edu.cn)
- **conda 源**: 清华大学镜像 (mirrors.tuna.tsinghua.edu.cn)
### 包含的包
- **rdkit**: 化学信息学工具包
- **openbabel**: 分子格式转换工具
- **meeko**: 分子准备工具
- **AutoDock Vina**: 分子对接工具
### 目录结构
```
/app/
├── bin/ # AutoDock Vina 二进制文件
├── scripts/ # Python 脚本
├── data/ # 输入数据(挂载)
└── results/ # 输出结果(挂载)
```
## 常用命令
### 运行脚本
```bash
# 在容器中运行 Python 脚本
pixi run python scripts/calculate_qed_values.py
# 运行批处理脚本
pixi run bash scripts/batch_docking.sh
```
### 数据管理
```bash
# 挂载数据目录
docker run -it --rm \
-v $(pwd)/data:/app/data \
-v $(pwd)/results:/app/results \
vinatools:latest bash
```
### 清理
```bash
# 停止所有服务
docker-compose -f docker/docker-compose.yml down
# 删除镜像
docker rmi vinatools:latest
# 清理未使用的资源
docker system prune -a
```
## 故障排除
### 网络连接问题
如果遇到网络连接问题,可以尝试以下解决方案:
```bash
# 1. 使用代理构建
docker build --build-arg HTTP_PROXY=http://proxy:port \
--build-arg HTTPS_PROXY=http://proxy:port \
-f docker/Dockerfile -t vinatools:latest .
# 2. 使用不同的镜像源
# 编辑 Dockerfile,替换镜像源
# - APT: mirrors.ustc.edu.cn (中科大)
# - pip: pypi.douban.com (豆瓣)
# - conda: mirrors.ustc.edu.cn/anaconda/cloud/
# 3. 离线构建(如果网络完全不可用)
# 预先下载所有依赖包,然后使用本地构建
```
### 权限问题
```bash
# 修复文件权限
sudo chown -R $USER:$USER data/ results/
```
### 内存不足
```bash
# 增加 Docker 内存限制
docker run -it --rm --memory=8g vinatools:latest bash
```
### 网络问题
```bash
# 使用主机网络
docker run -it --rm --network=host vinatools:latest bash
```
### 镜像源切换
如果需要使用其他镜像源,可以修改 Dockerfile 中的配置:
```dockerfile
# APT 源切换
RUN sed -i 's@//.*archive.ubuntu.com@//mirrors.ustc.edu.cn@g' /etc/apt/sources.list
# pip 源切换
RUN echo "index-url = https://pypi.douban.com/simple" > /root/.pip/pip.conf
# conda 源切换
RUN /root/.pixi/bin/pixi config set channels https://mirrors.ustc.edu.cn/anaconda/cloud/conda-forge/
```

70
docker/docker-compose.yml Normal file
View File

@@ -0,0 +1,70 @@
# NOTE(review): the top-level `version` key is obsolete and ignored by
# Compose v2; kept for compatibility with older docker-compose binaries.
version: '3.8'

services:
  # Main service: pixi environment with all dependencies.
  vinatools:
    build:
      context: ..
      dockerfile: docker/Dockerfile
      args:
        VINA_VERSION: ${VINA_VERSION:-1.2.7}
        # Default must be a real Vina release asset suffix; plain "linux" is
        # not a published artifact name. Matches the jupyter service and
        # docker.env.example.
        VINA_PLATFORM: ${VINA_PLATFORM:-linux_x86_64}
        DOWNLOAD_VINA: ${DOWNLOAD_VINA:-true}
    image: vinatools:latest
    container_name: vinatools-container
    volumes:
      # Mount the project root into the container
      - ..:/app
      # Mount data directories (input / output files)
      - ./data:/app/data
      - ./results:/app/results
    working_dir: /app
    environment:
      - PIXI_ROOT=/root/.pixi
      # NOTE(review): $PATH here is substituted from the HOST environment at
      # compose parse time, not expanded inside the container — confirm this
      # is intended.
      - PATH=/root/.pixi/bin:/app/bin:$PATH
    # Keep the container running for interactive use
    tty: true
    stdin_open: true
    # Share the host network namespace
    network_mode: host
    restart: unless-stopped
    # NOTE(review): `deploy.resources` is only honored by Swarm or
    # `docker compose --compatibility`; plain `docker compose up` ignores it.
    deploy:
      resources:
        limits:
          memory: 4G
          cpus: '2.0'
        reservations:
          memory: 2G
          cpus: '1.0'

  # Optional: Jupyter Notebook service.
  jupyter:
    build:
      context: ..
      dockerfile: docker/Dockerfile
      args:
        VINA_VERSION: ${VINA_VERSION:-1.2.7}
        VINA_PLATFORM: ${VINA_PLATFORM:-linux_x86_64}
        DOWNLOAD_VINA: ${DOWNLOAD_VINA:-true}
    image: vinatools:latest
    container_name: vinatools-jupyter
    ports:
      - "8888:8888"
    volumes:
      - ..:/app
      - ./data:/app/data
      - ./results:/app/results
    working_dir: /app
    environment:
      - PIXI_ROOT=/root/.pixi
      - PATH=/root/.pixi/bin:/app/bin:$PATH
    # NOTE(review): the notebook runs with an empty token/password — anyone
    # who can reach port 8888 gets full code execution; restrict exposure.
    command: >
      bash -c "
      /root/.pixi/bin/pixi workspace platform add linux-aarch64 &&
      /root/.pixi/bin/pixi add jupyter notebook &&
      /root/.pixi/bin/pixi run jupyter notebook --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password=''
      "
    restart: unless-stopped
    depends_on:
      - vinatools

12
docker/docker.env.example Normal file
View File

@@ -0,0 +1,12 @@
# AutoDock Vina 配置
VINA_VERSION=1.2.7
VINA_PLATFORM=mac_aarch64
DOWNLOAD_VINA=true
# 其他平台选项:
# VINA_PLATFORM=linux_x86_64
# VINA_PLATFORM=mac_x86_64
# VINA_PLATFORM=windows_x86_64
# 禁用 AutoDock Vina 下载
# DOWNLOAD_VINA=false

4345
pixi.lock generated Normal file

File diff suppressed because it is too large Load Diff

15
pixi.toml Normal file
View File

@@ -0,0 +1,15 @@
# pixi workspace configuration for the vinatools project.
[workspace]
authors = ["hotwa <pylyzeng@gmail.com>"]
# All packages are resolved from conda-forge.
channels = ["conda-forge"]
name = "vinatools"
# Supported targets: Apple Silicon macOS and 64-bit ARM Linux (the Docker
# build adds linux-aarch64 via `pixi workspace platform add`).
platforms = ["osx-arm64", "linux-aarch64"]
version = "0.1.0"

# No predefined tasks yet; run tools via `pixi run <cmd>`.
[tasks]

# Runtime dependencies (conda-forge packages).
[dependencies]
rdkit = ">=2025.9.1,<2026"
openbabel = ">=3.1.1,<4"
meeko = ">=0.5.0,<0.6"
jupyter = ">=1.1.1,<2"
notebook = ">=7.4.7,<8"

View File

@@ -0,0 +1,551 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :analyze_qed_mw_distribution.py
@Description :Analysis of QED and molecular weight distribution with KDE plots
@Date :2025/08/05
@Author :lyzeng
'''
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from rdkit import Chem
import logging
import ast
import json
import click
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def load_dataset(csv_file):
    """
    Read a QED / molecular-weight dataset from a CSV file and log a summary.

    Args:
        csv_file (str): Path to the CSV file; must contain 'qed' and
            'molecular_weight' columns.

    Returns:
        pd.DataFrame: The loaded dataset.
    """
    dataset = pd.read_csv(csv_file)
    logger.info(f"Loaded {len(dataset)} records from {csv_file}")

    # Log basic descriptive statistics for the two columns of interest.
    logger.info(f"Statistics for {Path(csv_file).stem}:")
    qed = dataset['qed']
    logger.info(f"QED - Min: {qed.min():.3f}, Max: {qed.max():.3f}, Mean: {qed.mean():.3f}")
    mw = dataset['molecular_weight']
    logger.info(f"Molecular Weight - Min: {mw.min():.2f}, Max: {mw.max():.2f}, Mean: {mw.mean():.2f}")
    return dataset
def load_reference_molecules(dataset_name):
    """
    Load reference molecules for a dataset from its QED values CSV file.

    Reference molecules are the rows whose filename matches the
    ``align_*_out_converted.sdf`` pattern produced by the docking pipeline.

    Args:
        dataset_name (str): Name of the dataset (e.g. "fgbar" or "trpe"); the
            CSV is looked up as ``qed_values_<dataset_name>.csv`` in the
            current working directory.

    Returns:
        pd.DataFrame: Rows for the reference molecules (empty DataFrame when
        no matching CSV file is found).
    """
    log = logging.getLogger(__name__)
    # Look up the dataset's CSV in the current working directory.
    csv_files = list(Path(".").glob(f"qed_values_{dataset_name}.csv"))
    if not csv_files:
        log.warning(f"No CSV file found for {dataset_name}")
        return pd.DataFrame()
    df = pd.read_csv(csv_files[0])
    # Reference molecules carry 'align_' and '_out_converted.sdf' in their
    # filename.  The pattern is a raw string: '\.' in a plain string literal
    # is an invalid escape sequence (SyntaxWarning on Python >= 3.12).
    reference_df = df[df['filename'].str.contains(r'align_.*_out_converted\.sdf', na=False, regex=True)]
    log.info(f"Loaded {len(reference_df)} reference molecules for {dataset_name}")
    return reference_df
def extract_vina_scores_from_sdf(sdf_file_path):
    """
    Collect the Vina docking score of every conformer stored in an SDF file.

    Scores are read from the 'meeko' property that the meeko toolchain writes
    on each molecule record (a JSON blob containing 'free_energy').

    Args:
        sdf_file_path (str): Path to the SDF file.

    Returns:
        list: free_energy values, one per conformer; empty list on failure.
    """
    free_energies = []
    try:
        for mol in Chem.SDMolSupplier(sdf_file_path, removeHs=False):
            if mol is None:
                continue
            # Docking metadata lives in the 'meeko' molecule property.
            if not mol.HasProp("meeko"):
                logger.warning(f"No meeko property found in molecule from {sdf_file_path}")
                continue
            try:
                payload = json.loads(mol.GetProp("meeko"))
            except json.JSONDecodeError:
                logger.warning(f"Failed to parse meeko JSON for {sdf_file_path}")
                continue
            if 'free_energy' in payload:
                free_energies.append(payload['free_energy'])
    except Exception as e:
        logger.warning(f"Failed to extract Vina scores from {sdf_file_path}: {e}")
    return free_energies
def load_vina_scores_from_csv(df, max_files=1000):
    """
    Flatten the per-molecule Vina score lists stored in a DataFrame.

    The 'vina_scores' column holds stringified Python lists; each is parsed
    with ast.literal_eval and the scores are pooled into a single list.
    Reference molecules (rows whose filename contains '.mol2') are skipped,
    and at most `max_files` rows are processed to bound memory usage.

    Args:
        df (pd.DataFrame): DataFrame with 'filename' and 'vina_scores' columns.
        max_files (int): Maximum number of rows to process.

    Returns:
        list: All Vina scores pooled across the processed rows.
    """
    pooled = []
    parsed_count = 0
    for _, record in df.iterrows():
        if parsed_count >= max_files:
            break
        # Reference molecules are handled separately elsewhere.
        if '.mol2' in record['filename']:
            continue
        try:
            pooled.extend(ast.literal_eval(record['vina_scores']))
            parsed_count += 1
        except (ValueError, SyntaxError) as e:
            logger.warning(f"Failed to parse Vina scores for {record['filename']}: {e}")
    logger.info(f"Loaded {len(pooled)} Vina scores from {parsed_count} files")
    return pooled
def get_min_vina_scores_length(df):
    """
    Find the smallest number of conformer scores any non-reference row has.

    Args:
        df (pd.DataFrame): DataFrame with 'filename' and 'vina_scores'
            columns ('vina_scores' holds stringified Python lists).

    Returns:
        int: Minimum list length across parseable non-reference rows, or 0
        when no such row exists.
    """
    shortest = None
    for _, record in df.iterrows():
        # Reference molecules (mol2-based filenames) are not considered.
        if '.mol2' in record['filename']:
            continue
        try:
            count = len(ast.literal_eval(record['vina_scores']))
        except (ValueError, SyntaxError) as e:
            logger.warning(f"Failed to parse Vina scores for {record['filename']}: {e}")
            continue
        if shortest is None or count < shortest:
            shortest = count
    return shortest if shortest is not None else 0
def get_reference_vina_scores(dataset_name, rank=0):
    """
    Get Vina scores for reference molecules from their docked SDF files.

    Scans ``result/refence/<dataset_name>/*_converted.sdf`` (the directory
    name "refence" is the project's original spelling — kept deliberately),
    pulls each file's conformer scores, and records the score at `rank`
    under a cleaned-up ligand identifier.

    Args:
        dataset_name (str): Name of the dataset (fgbar or trpe)
        rank (int): Rank of the conformation to use (0 for best/first)

    Returns:
        dict: Dictionary with reference molecule identifiers and their Vina scores

    Raises:
        ValueError: If `rank` is >= the number of conformers in any file.
    """
    reference_scores = {}
    # Original directory name "refence" (sic) — do not "fix" the spelling,
    # it must match the on-disk layout.
    reference_dir = Path("result") / "refence" / dataset_name
    if not reference_dir.exists():
        logger.warning(f"Reference directory {reference_dir} does not exist")
        return reference_scores
    # Find reference SDF files produced by the conversion step.
    reference_sdf_files = list(reference_dir.glob("*_converted.sdf"))
    logger.info(f"Processing {len(reference_sdf_files)} reference SDF files in {reference_dir}")
    for sdf_file in reference_sdf_files:
        vina_scores = extract_vina_scores_from_sdf(str(sdf_file))
        if vina_scores:
            # Fail fast if the requested rank exceeds this file's conformers.
            if rank >= len(vina_scores):
                raise ValueError(f"Rank {rank} is out of range. The minimum number of conformers across all molecules is {len(vina_scores)}. Please choose a rank less than {len(vina_scores)}.")
            reference_score = vina_scores[rank]
            # Reduce the filename to a short ligand identifier: strip the
            # pipeline suffixes in order, then take the trailing token of
            # the 'align_...' prefix (e.g. 9NY or 0GA).  The order of these
            # replacements matters.
            filename_stem = sdf_file.stem
            if '_out_converted' in filename_stem:
                filename_stem = filename_stem.replace('_out_converted', '')
            if '_addH' in filename_stem:
                filename_stem = filename_stem.replace('_addH', '')
            if 'align_' in filename_stem:
                filename_stem = filename_stem.split('_')[-1]  # Get the last part (e.g., 9NY or 0GA)
            # Use filename_stem as key for reference_scores
            reference_scores[filename_stem] = reference_score
            logger.info(f"Reference Vina score for {filename_stem} (rank {rank}): {reference_score}")
    return reference_scores
def plot_combined_kde_distribution_normalized(df, dataset_name, reference_df=None, reference_scores=None, vina_scores=None, reference_vina_scores=None):
    """
    Plot combined KDE distribution for QED, molecular weight, and Vina scores
    on a single axis, with every series min-max normalized to [0, 1].

    The figure is saved as ``kde_distribution_<dataset_name>_normalized.png``
    and the pyplot figure is closed afterwards.

    Args:
        df (pd.DataFrame): Main dataset (needs 'qed' and 'molecular_weight').
        dataset_name (str): Name of the dataset (fgbar or trpe).
        reference_df (pd.DataFrame): Reference molecules dataset (optional).
        reference_scores (dict): Reference molecule scores (optional).
            NOTE(review): this parameter is never read in this function.
        vina_scores (list): Vina scores for all molecules (optional).
        reference_vina_scores (dict): Reference molecule Vina scores (optional).
    """
    plt.figure(figsize=(15, 8))
    # Min-max normalize each series so they share one x-axis.
    qed_normalized = (df['qed'] - df['qed'].min()) / (df['qed'].max() - df['qed'].min())
    mw_normalized = (df['molecular_weight'] - df['molecular_weight'].min()) / (df['molecular_weight'].max() - df['molecular_weight'].min())
    sns.kdeplot(qed_normalized, label='QED (normalized)', fill=True, alpha=0.5, color='blue')
    sns.kdeplot(mw_normalized, label='Molecular Weight (normalized)', fill=True, alpha=0.5, color='red')
    if vina_scores and len(vina_scores) > 0:
        # NOTE(review): the comment mentions negating scores for display,
        # but the code performs a plain min-max normalization — confirm.
        vina_series = pd.Series(vina_scores)
        vina_normalized = (vina_series - vina_series.min()) / (vina_series.max() - vina_series.min())
        sns.kdeplot(vina_normalized, label='Vina Score (normalized)', fill=True, alpha=0.5, color='green')
    # Mark reference molecules if provided.
    if reference_df is not None and len(reference_df) > 0:
        # Normalize reference values with the MAIN dataset's min/max so the
        # markers land on the same scale as the curves.
        ref_qed_normalized = (reference_df['qed'] - df['qed'].min()) / (df['qed'].max() - df['qed'].min())
        ref_mw_normalized = (reference_df['molecular_weight'] - df['molecular_weight'].min()) / (df['molecular_weight'].max() - df['molecular_weight'].min())
        legend_handles = []
        for i, (idx, row) in enumerate(reference_df.iterrows()):
            # Reduce the filename to the short ligand identifier
            # (e.g. 9NY or 0GA); replacement order matters.
            filename_stem = Path(row['filename']).stem
            if '_addH' in filename_stem:
                filename_stem = filename_stem.replace('_addH', '')
            if 'align_' in filename_stem:
                filename_stem = filename_stem.split('_')[-1]
            qed_value = row['qed']
            mw_value = row['molecular_weight']
            # Build the annotation label (QED, MW and — when known — Vina).
            score_text = f"{filename_stem}\n(QED: {qed_value:.2f}, MW: {mw_value:.2f}"
            if reference_vina_scores and filename_stem in reference_vina_scores:
                vina_score = reference_vina_scores[filename_stem]
                score_text += f", Vina: {vina_score:.2f}"
            score_text += ")"
            # QED marker on the baseline.
            x_pos = ref_qed_normalized.iloc[i]
            plt.scatter(x_pos, 0, color='darkblue', s=100, marker='v', zorder=5)
            # Molecular weight marker; note x_pos is REUSED here, so the
            # annotation below attaches to the MW position, not the QED one.
            x_pos = ref_mw_normalized.iloc[i]
            plt.scatter(x_pos, 0, color='darkred', s=100, marker='^', zorder=5)
            if reference_vina_scores and filename_stem in reference_vina_scores:
                # NOTE(review): assumes vina_scores is non-empty whenever
                # reference_vina_scores is — min()/max() would raise otherwise.
                vina_min = min(vina_scores)
                vina_max = max(vina_scores)
                ref_vina_normalized = (reference_vina_scores[filename_stem] - vina_min) / (vina_max - vina_min)
                plt.scatter(ref_vina_normalized, 0, color='darkgreen', s=100, marker='o', zorder=5)
            plt.annotate(score_text,
                         (x_pos, 0),
                         xytext=(10, 30),
                         textcoords='offset points',
                         bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7),
                         arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'),
                         fontsize=8)
            # Proxy handles so each reference marker appears in the legend.
            legend_handles.append(plt.Line2D([0], [0], marker='v', color='darkblue', label=f"{filename_stem} - QED",
                                             markerfacecolor='darkblue', markersize=8, linestyle=''))
            legend_handles.append(plt.Line2D([0], [0], marker='^', color='darkred', label=f"{filename_stem} - MW",
                                             markerfacecolor='darkred', markersize=8, linestyle=''))
            if reference_vina_scores and filename_stem in reference_vina_scores:
                legend_handles.append(plt.Line2D([0], [0], marker='o', color='darkgreen', label=f"{filename_stem} - Vina",
                                                 markerfacecolor='darkgreen', markersize=8, linestyle=''))
        plt.legend(handles=legend_handles, loc='upper right', fontsize=10)
    plt.title(f'Combined KDE Distribution (Normalized) - {dataset_name.upper()}', fontsize=16)
    plt.xlabel('Normalized Values (0-1)')
    plt.ylabel('Density')
    # NOTE(review): this second legend() call replaces the handles-based
    # legend built above with an auto-generated one — likely unintended.
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'kde_distribution_{dataset_name}_normalized.png', dpi=300, bbox_inches='tight')
    plt.close()
    logger.info(f"Saved combined KDE distribution plot (normalized) for {dataset_name} as kde_distribution_{dataset_name}_normalized.png")
def plot_combined_kde_distribution_actual(df, dataset_name, reference_df=None, reference_scores=None, vina_scores=None, reference_vina_scores=None):
    """
    Plot KDE distributions for QED, molecular weight, and Vina scores as
    three side-by-side subplots using the actual (un-normalized) values.

    The figure is saved as ``kde_distribution_<dataset_name>_actual.png`` and
    the pyplot figure is closed afterwards.

    Args:
        df (pd.DataFrame): Main dataset (needs 'qed' and 'molecular_weight').
        dataset_name (str): Name of the dataset (fgbar or trpe).
        reference_df (pd.DataFrame): Reference molecules dataset (optional).
        reference_scores (dict): Reference molecule scores (optional).
            NOTE(review): this parameter is never read in this function.
        vina_scores (list): Vina scores for all molecules (optional).
        reference_vina_scores (dict): Reference molecule Vina scores (optional).
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    fig.suptitle(f'KDE Distribution (Actual Values) - {dataset_name.upper()}', fontsize=16)
    # Derive a ligand identifier from the FIRST reference row; it is used in
    # every annotation below.
    # NOTE(review): with multiple reference molecules all annotations carry
    # the first molecule's name — confirm this is intended.
    reference_filename_stem = None
    if reference_df is not None and len(reference_df) > 0:
        reference_filename_stem = Path(reference_df.iloc[0]['filename']).stem
        if '_out_converted' in reference_filename_stem:
            reference_filename_stem = reference_filename_stem.replace('_out_converted', '')
        if '_addH' in reference_filename_stem:
            reference_filename_stem = reference_filename_stem.replace('_addH', '')
        if 'align_' in reference_filename_stem:
            reference_filename_stem = reference_filename_stem.split('_')[-1]  # Get the last part (e.g., 9NY or 0GA)
    # Plot 1: QED distribution.
    sns.kdeplot(df['qed'], ax=axes[0], fill=True, alpha=0.5, color='blue')
    axes[0].set_title('QED Distribution')
    axes[0].set_xlabel('QED Value')
    axes[0].set_ylabel('Density')
    axes[0].grid(True, alpha=0.3)
    # Mark reference molecules on the QED axis.
    if reference_df is not None and len(reference_df) > 0:
        for i, (idx, row) in enumerate(reference_df.iterrows()):
            filename_stem = Path(row['filename']).stem
            if '_addH' in filename_stem:
                filename_stem = filename_stem.replace('_addH', '')
            if 'align_' in filename_stem:
                filename_stem = filename_stem.split('_')[-1]  # Get the last part (e.g., 9NY or 0GA)
            qed_value = row['qed']
            score_text = f"{reference_filename_stem}\n({qed_value:.2f})"
            axes[0].scatter(row['qed'], 0, color='darkblue', s=100, marker='v', zorder=5)
            axes[0].annotate(score_text,
                             (row['qed'], 0),
                             xytext=(10, 20),
                             textcoords='offset points',
                             bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7),
                             arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'),
                             fontsize=8)
    # Plot 2: molecular weight distribution.
    sns.kdeplot(df['molecular_weight'], ax=axes[1], fill=True, alpha=0.5, color='red')
    axes[1].set_title('Molecular Weight Distribution')
    axes[1].set_xlabel('Molecular Weight (Daltons)')
    axes[1].set_ylabel('Density')
    axes[1].grid(True, alpha=0.3)
    # Mark reference molecules on the molecular-weight axis.
    if reference_df is not None and len(reference_df) > 0:
        for i, (idx, row) in enumerate(reference_df.iterrows()):
            filename_stem = Path(row['filename']).stem
            if '_addH' in filename_stem:
                filename_stem = filename_stem.replace('_addH', '')
            if 'align_' in filename_stem:
                filename_stem = filename_stem.split('_')[-1]  # Get the last part (e.g., 9NY or 0GA)
            mw_value = row['molecular_weight']
            score_text = f"{reference_filename_stem}\n({mw_value:.2f})"
            axes[1].scatter(row['molecular_weight'], 0, color='darkred', s=100, marker='^', zorder=5)
            axes[1].annotate(score_text,
                             (row['molecular_weight'], 0),
                             xytext=(10, -30),
                             textcoords='offset points',
                             bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7),
                             arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'),
                             fontsize=8)
    # Plot 3: Vina score distribution (only when scores are available —
    # otherwise the third axis is left empty).
    if vina_scores and len(vina_scores) > 0:
        vina_series = pd.Series(vina_scores)
        sns.kdeplot(vina_series, ax=axes[2], fill=True, alpha=0.5, color='green')
        axes[2].set_title('Vina Score Distribution')
        axes[2].set_xlabel('Vina Score (kcal/mol)')
        axes[2].set_ylabel('Density')
        axes[2].grid(True, alpha=0.3)
        # Mark reference molecules on the Vina-score axis.
        if reference_vina_scores:
            for filename_stem, vina_score in reference_vina_scores.items():
                score_text = f"{reference_filename_stem}\n({vina_score:.2f})"
                axes[2].scatter(vina_score, 0, color='darkgreen', s=100, marker='o', zorder=5)
                axes[2].annotate(score_text,
                                 (vina_score, 0),
                                 xytext=(10, -60),
                                 textcoords='offset points',
                                 bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7),
                                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'),
                                 fontsize=8)
    plt.tight_layout()
    plt.savefig(f'kde_distribution_{dataset_name}_actual.png', dpi=300, bbox_inches='tight')
    plt.close()
    logger.info(f"Saved combined KDE distribution plot (actual values) for {dataset_name} as kde_distribution_{dataset_name}_actual.png")
def analyze_dataset(csv_file, dataset_name, reference_scores=None, rank=0):
    """
    Run the full analysis for one dataset: load data, gather reference
    information and Vina scores, then render both KDE plot variants.

    Args:
        csv_file (str): Path to the CSV file.
        dataset_name (str): Name of the dataset (fgbar or trpe).
        reference_scores (dict): Reference scores for the dataset.
        rank (int): Conformation rank used for reference Vina scores
            (0 = best/first).

    Raises:
        ValueError: If `rank` exceeds the smallest conformer count present.
    """
    df = load_dataset(csv_file)

    # Validate the requested rank against the shortest score list up front.
    min_vina_length = get_min_vina_scores_length(df)
    if rank >= min_vina_length:
        raise ValueError(f"Rank {rank} is out of range. The minimum number of conformers across all molecules is {min_vina_length}. Please choose a rank less than {min_vina_length}.")

    # Gather all inputs needed by the plotting functions.
    reference_df = load_reference_molecules(dataset_name)
    vina_scores = load_vina_scores_from_csv(df)
    reference_vina_scores = get_reference_vina_scores(dataset_name, rank)

    # Render the normalized view and the actual-value view.
    plot_combined_kde_distribution_normalized(df, dataset_name, reference_df, reference_scores, vina_scores, reference_vina_scores)
    plot_combined_kde_distribution_actual(df, dataset_name, reference_df, reference_scores, vina_scores, reference_vina_scores)
def get_default_reference_scores():
    """
    Default reference binding scores, taken from README.md.

    Returns:
        dict: Mapping of dataset name -> {ligand identifier: Vina score}.
    """
    fgbar_refs = {'9NY': -5.268}  # value documented in README.md
    trpe_refs = {'0GA': -6.531}   # value documented in README.md
    return {'fgbar': fgbar_refs, 'trpe': trpe_refs}
@click.command()
@click.argument('csv_files', type=click.Path(exists=True), nargs=-1)
@click.option('--dataset-names', '-d', multiple=True, help='Names of the datasets corresponding to CSV files')
@click.option('--reference-scores', '-r', type=str, help='Reference scores in JSON format')
@click.option('--rank', '-k', default=0, type=int, help='Rank of conformation to use for reference Vina scores (default: 0 for best/first)')
def main_cli(csv_files, dataset_names, reference_scores, rank):
    """
    Analyze QED and molecular weight distributions and generate KDE plots

    CSV_FILES: Paths to the CSV files with QED and molecular weight data
    """
    # click delivers nargs=-1 arguments as a (possibly empty) tuple.
    if not csv_files:
        logger.error("At least one CSV file must be provided")
        return
    # Convert the multiple-option tuple to a list; None lets main_api derive
    # names from the CSV filenames.
    dataset_names_list = list(dataset_names) if dataset_names else None
    # Reference scores: parse the user-supplied JSON, or fall back to the
    # README defaults.
    if reference_scores:
        try:
            reference_scores_dict = json.loads(reference_scores)
        except json.JSONDecodeError:
            logger.error("Invalid JSON format for reference scores")
            return
    else:
        reference_scores_dict = get_default_reference_scores()
    # Delegate to the programmatic entry point; log and re-raise so the
    # process exits non-zero on failure.
    try:
        main_api(csv_files, dataset_names_list, reference_scores_dict, rank)
    except Exception as e:
        logger.error(f"Analysis failed: {e}")
        raise
def main_api(csv_files, dataset_names=None, reference_scores=None, rank=0):
    """
    Programmatic entry point: analyze each CSV file in turn.

    Args:
        csv_files (list): CSV files to analyze.
        dataset_names (list): Dataset names matching csv_files; derived from
            the filenames (stripping the 'qed_values_' prefix) when omitted.
        reference_scores (dict): Per-dataset reference scores; defaults to
            the values from README.md.
        rank (int): Conformation rank used for reference Vina scores.
    """
    if dataset_names is None:
        # e.g. 'qed_values_fgbar.csv' -> 'fgbar'
        dataset_names = [Path(path).stem.replace('qed_values_', '') for path in csv_files]
    if reference_scores is None:
        reference_scores = get_default_reference_scores()

    for csv_path, name in zip(csv_files, dataset_names):
        try:
            logger.info(f"Analyzing dataset: {name}")
            analyze_dataset(csv_path, name, reference_scores.get(name, {}), rank)
        except Exception as e:
            logger.error(f"Error analyzing {name}: {e}")
            raise
# Run the Click CLI when executed as a script.
if __name__ == "__main__":
    main_cli()

151
scripts/analyze_results.py Normal file
View File

@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
分析AutoDock Vina对接结果脚本
用法: python analyze_results.py poses_directory output_csv
"""
import os
import sys
import pandas as pd
import re
from pathlib import Path
def parse_vina_output(pdbqt_file):
    """Parse a Vina output PDBQT file and extract the energy of each pose.

    Reads 'REMARK VINA RESULT:' lines of the form
    ``REMARK VINA RESULT: -8.5 0.000 0.000``
    (binding energy, RMSD lower bound, RMSD upper bound).

    Args:
        pdbqt_file: Path to a ``*_out.pdbqt`` file produced by Vina.

    Returns:
        list[dict]: One record per pose with ligand name, model index,
        binding energy, RMSD bounds, and source file path; empty on error.
    """
    poses = []
    try:
        with open(pdbqt_file, 'r') as fh:
            content = fh.readlines()
        # Ligand name = file stem without the Vina '_out' suffix.
        ligand_name = Path(pdbqt_file).stem.replace('_out', '')
        model_index = 0
        for raw_line in content:
            if not raw_line.startswith('REMARK VINA RESULT:'):
                continue
            fields = raw_line.strip().split()
            # fields[3] is the binding energy; the RMSD columns may be absent.
            if len(fields) >= 4:
                model_index += 1
                poses.append({
                    'ligand_name': ligand_name,
                    'model': model_index,
                    'binding_energy': float(fields[3]),
                    'rmsd_lb': float(fields[4]) if len(fields) > 4 else 0.0,
                    'rmsd_ub': float(fields[5]) if len(fields) > 5 else 0.0,
                    'file_path': pdbqt_file,
                })
    except Exception as e:
        print(f"解析文件失败 {pdbqt_file}: {e}")
    return poses
def analyze_poses_directory(poses_dir):
    """Collect docking records from every *_out.pdbqt file under poses_dir.

    Returns a flat list of pose dicts (see parse_vina_output); an empty list
    when the directory does not exist or holds no result files.
    """
    poses_path = Path(poses_dir)
    if not poses_path.exists():
        print(f"错误: 目录不存在 {poses_dir}")
        return []
    # Vina writes docked poses as <ligand>_out.pdbqt
    pdbqt_files = list(poses_path.glob("*_out.pdbqt"))
    print(f"找到 {len(pdbqt_files)} 个结果文件")
    collected = []
    for pdbqt_file in pdbqt_files:
        collected.extend(parse_vina_output(str(pdbqt_file)))
    return collected
def generate_summary_stats(df):
    """Build summary statistics over a docking-results DataFrame.

    Expects columns 'ligand_name' and 'binding_energy'. Counts of ligands
    below the -7/-8/-9 kcal/mol thresholds use each ligand's best (lowest
    energy) pose only. Returns an empty dict for an empty DataFrame.
    """
    if df.empty:
        return {}
    # Best (lowest-energy) pose per ligand
    per_ligand_best = df.loc[df.groupby('ligand_name')['binding_energy'].idxmin()]
    energies = df['binding_energy']
    best_energies = per_ligand_best['binding_energy']
    return {
        'total_ligands': df['ligand_name'].nunique(),
        'total_poses': len(df),
        'best_binding_energy': energies.min(),
        'worst_binding_energy': energies.max(),
        'mean_binding_energy': energies.mean(),
        'median_binding_energy': energies.median(),
        'std_binding_energy': energies.std(),
        'ligands_better_than_minus_7': (best_energies < -7.0).sum(),
        'ligands_better_than_minus_8': (best_energies < -8.0).sum(),
        'ligands_better_than_minus_9': (best_energies < -9.0).sum(),
    }
def main():
    """CLI entry point: parse args, aggregate pose files, write CSV and summary."""
    if len(sys.argv) != 3:
        print("用法: python analyze_results.py <poses目录> <输出CSV文件>")
        sys.exit(1)
    poses_dir, output_csv = sys.argv[1], sys.argv[2]
    print("开始分析对接结果...")
    parsed = analyze_poses_directory(poses_dir)
    if not parsed:
        print("没有找到有效的结果文件")
        sys.exit(1)
    # Persist the per-pose detail table first
    results_df = pd.DataFrame(parsed)
    results_df.to_csv(output_csv, index=False)
    print(f"详细结果已保存到: {output_csv}")
    stats = generate_summary_stats(results_df)
    print("\n=== 对接结果汇总 ===")
    print(f"总配体数量: {stats['total_ligands']}")
    print(f"总构象数量: {stats['total_poses']}")
    print(f"最佳结合能: {stats['best_binding_energy']:.2f} kcal/mol")
    print(f"最差结合能: {stats['worst_binding_energy']:.2f} kcal/mol")
    print(f"平均结合能: {stats['mean_binding_energy']:.2f} kcal/mol")
    print(f"中位结合能: {stats['median_binding_energy']:.2f} kcal/mol")
    print(f"标准差: {stats['std_binding_energy']:.2f} kcal/mol")
    print(f"结合能 < -7.0 的配体: {stats['ligands_better_than_minus_7']}")
    print(f"结合能 < -8.0 的配体: {stats['ligands_better_than_minus_8']}")
    print(f"结合能 < -9.0 的配体: {stats['ligands_better_than_minus_9']}")
    # Top-10 ligands ranked by their best pose energy
    top10 = results_df.loc[results_df.groupby('ligand_name')['binding_energy'].idxmin()]
    top10 = top10.sort_values('binding_energy').head(10)
    print("\n=== 前10个最佳结果 ===")
    for _, row in top10.iterrows():
        print(f"{row['ligand_name']}: {row['binding_energy']:.2f} kcal/mol")
    # Plain-text summary next to the CSV
    summary_file = output_csv.replace('.csv', '_summary.txt')
    with open(summary_file, 'w') as f:
        f.write("AutoDock Vina 对接结果汇总\n")
        f.write("=" * 30 + "\n\n")
        for key, value in stats.items():
            f.write(f"{key}: {value}\n")
    print(f"\n汇总统计已保存到: {summary_file}")
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,228 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :calculate_qed_values.py
@Description :Calculate QED values for molecules in poses_all directories and reference molecules
@Date :2025/08/04
@Author :lyzeng
'''
import pandas as pd
from rdkit import Chem
from rdkit.Chem import QED
from rdkit.Chem.Descriptors import MolWt
from pathlib import Path
import logging
import json
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def get_smiles_from_sdf(sdf_file_path):
    """
    Extract SMILES from the first molecule in an SDF file.

    Args:
        sdf_file_path (Path): Path to the SDF file

    Returns:
        str: Canonical SMILES of the first molecule, or None if parsing fails
    """
    try:
        # next() raises StopIteration on an empty file; caught below like any failure
        first_mol = next(Chem.SDMolSupplier(str(sdf_file_path)))
        if first_mol is not None:
            return Chem.MolToSmiles(first_mol)
    except Exception as e:
        logger.warning(f"Failed to extract SMILES from {sdf_file_path}: {e}")
    return None
def get_smiles_from_mol2(mol2_file_path):
    """
    Extract SMILES from a mol2 file.

    Args:
        mol2_file_path (Path): Path to the mol2 file

    Returns:
        str: Canonical SMILES of the molecule, or None if parsing fails
    """
    try:
        parsed = Chem.MolFromMol2File(str(mol2_file_path))
        if parsed is not None:
            return Chem.MolToSmiles(parsed)
    except Exception as e:
        logger.warning(f"Failed to extract SMILES from {mol2_file_path}: {e}")
    return None
def extract_vina_scores_from_sdf(sdf_file_path):
    """
    Extract Vina scores from all conformers in an SDF file.

    Reads the JSON blob stored by meeko under the "meeko" molecule property
    and collects its 'free_energy' entry for every conformer.

    Args:
        sdf_file_path (Path): Path to the SDF file

    Returns:
        list: Vina scores (free_energy values); empty when none can be read
    """
    scores = []
    try:
        for mol in Chem.SDMolSupplier(str(sdf_file_path), removeHs=False):
            if mol is None:
                continue
            if not mol.HasProp("meeko"):
                logger.warning(f"No meeko property found in molecule from {sdf_file_path}")
                continue
            try:
                payload = json.loads(mol.GetProp("meeko"))
            except json.JSONDecodeError:
                logger.warning(f"Failed to parse meeko JSON for {sdf_file_path}")
                continue
            # Only conformers that actually report a docking energy contribute
            if 'free_energy' in payload:
                scores.append(payload['free_energy'])
    except Exception as e:
        logger.warning(f"Failed to extract Vina scores from {sdf_file_path}: {e}")
    return scores
def calculate_qed_for_poses_all(base_dir, dataset_name):
    """
    Calculate QED values for all SDF files in a dataset's poses_all directory.

    Args:
        base_dir (Path): Base directory containing the poses_all folder
        dataset_name (str): Name of the dataset (fgbar or trpe)

    Returns:
        list: dicts with smiles, filename, qed, molecular_weight and the raw
              list of per-conformer Vina scores
    """
    records = []
    poses_all_dir = base_dir / dataset_name / "poses_all"
    if not poses_all_dir.exists():
        logger.warning(f"Directory {poses_all_dir} does not exist")
        return records
    sdf_files = list(poses_all_dir.glob("*.sdf"))
    logger.info(f"Processing {len(sdf_files)} SDF files in {poses_all_dir}")
    for sdf_file in sdf_files:
        smiles = get_smiles_from_sdf(sdf_file)
        vina_scores = extract_vina_scores_from_sdf(sdf_file)
        if smiles is None:
            continue
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            records.append({
                'smiles': smiles,
                'filename': sdf_file.name,
                'qed': QED.qed(mol),
                'molecular_weight': MolWt(mol),
                'vina_scores': vina_scores  # list of per-conformer Vina free energies
            })
        except Exception as e:
            logger.warning(f"Failed to calculate QED for {sdf_file}: {e}")
    return records
def calculate_qed_for_reference(base_dir, dataset_name):
    """
    Calculate QED values for reference molecules stored as SDF files.

    Args:
        base_dir (Path): Base directory containing the reference folder
        dataset_name (str): Name of the dataset (fgbar or trpe)

    Returns:
        list: dicts with smiles, filename, qed, molecular_weight and vina_scores
    """
    records = []
    # Directory name kept as the original (misspelled) "refence" on disk
    reference_dir = base_dir / "refence" / dataset_name
    if not reference_dir.exists():
        logger.warning(f"Directory {reference_dir} does not exist")
        return records
    # Reference molecules follow the <name>_out_converted.sdf naming scheme
    reference_sdf_files = list(reference_dir.glob("*_out_converted.sdf"))
    logger.info(f"Processing {len(reference_sdf_files)} reference SDF files in {reference_dir}")
    for sdf_file in reference_sdf_files:
        smiles = get_smiles_from_sdf(sdf_file)
        vina_scores = extract_vina_scores_from_sdf(sdf_file)
        if smiles is None:
            continue
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                continue
            records.append({
                'smiles': smiles,
                'filename': sdf_file.name,
                'qed': QED.qed(mol),
                'molecular_weight': MolWt(mol),
                # NOTE(review): stored as str(...) here but as a raw list in
                # calculate_qed_for_poses_all — confirm which form downstream expects
                'vina_scores': str(vina_scores)
            })
        except Exception as e:
            logger.warning(f"Failed to calculate QED for {sdf_file}: {e}")
    return records
def process_dataset(result_dir, dataset_name):
    """
    Process a single dataset (fgbar or trpe) and save it to its own CSV file.

    Combines QED records from the poses_all directory and the reference
    molecules, then writes qed_values_<dataset_name>.csv in the working dir.

    Args:
        result_dir (Path): Base result directory
        dataset_name (str): Name of the dataset (fgbar or trpe)
    """
    combined = (
        calculate_qed_for_poses_all(result_dir, dataset_name)
        + calculate_qed_for_reference(result_dir, dataset_name)
    )
    csv_filename = f"qed_values_{dataset_name}.csv"
    if combined:
        df = pd.DataFrame(combined)
        df.to_csv(csv_filename, index=False)
        logger.info(f"Saved {len(df)} QED values to {csv_filename}")
        print(f"First few rows of {csv_filename}:")
        print(df.head())
    else:
        logger.warning(f"No QED values were calculated for {dataset_name}")
        # Still emit an empty CSV with headers so downstream consumers find the file
        empty = pd.DataFrame(columns=['smiles', 'filename', 'qed', 'molecular_weight', 'vina_scores'])
        empty.to_csv(csv_filename, index=False)
def main():
    """
    Calculate QED values for every configured dataset under ./result,
    writing one CSV per dataset.
    """
    result_dir = Path("result")
    # Each dataset is processed independently into its own CSV
    for dataset in ("fgbar", "trpe"):
        process_dataset(result_dir, dataset)
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Example usage of the analyze_qed_mw_distribution API
"""
from analyze_qed_mw_distribution import main_api
print("Running analysis examples...")
# Example 1: Basic usage
# CSV files plus explicit dataset names; reference scores and rank use defaults
print("\nExample 1: Basic usage")
main_api(['qed_values_fgbar.csv', 'qed_values_trpe.csv'], ['fgbar', 'trpe'])
# Example 2: With custom reference scores
# Overrides the built-in reference Vina scores per dataset (ligand-id -> score)
print("\nExample 2: With custom reference scores")
main_api(['qed_values_fgbar.csv', 'qed_values_trpe.csv'], ['fgbar', 'trpe'],
         reference_scores={'fgbar': {'9NY': -5.268}, 'trpe': {'0GA': -6.531}})
# Example 3: With specific conformation rank
# rank=0 presumably selects the best/first conformer — confirm in main_api's docs
print("\nExample 3: With specific conformation rank")
main_api(['qed_values_fgbar.csv', 'qed_values_trpe.csv'], ['fgbar', 'trpe'], rank=0)
# Example 4: With both custom reference scores and specific conformation rank
print("\nExample 4: With both custom reference scores and specific conformation rank")
main_api(['qed_values_fgbar.csv', 'qed_values_trpe.csv'], ['fgbar', 'trpe'],
         reference_scores={'fgbar': {'9NY': -5.268}, 'trpe': {'0GA': -6.531}}, rank=0)
print("\nAnalysis complete! Check the generated PNG files.")

201
scripts/qed.py Normal file
View File

@@ -0,0 +1,201 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@file :qed_calculator.py
@Description :QED calculator with joblib parallel support
@Date :2025/08/04
@Author :lyzeng
'''
from rdkit import Chem
from rdkit.Chem import QED
import pandas as pd
from typing import List, Union, Tuple, Optional
import joblib
from joblib import Parallel, delayed
import logging
# 设置日志
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def calculate_single_qed(smiles: str) -> Optional[Tuple[str, float]]:
    """
    Calculate QED value for a single molecule.

    Args:
        smiles (str): SMILES representation of the molecule

    Returns:
        tuple: (smiles, qed_value), or None when parsing or scoring fails
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returns None (no exception) for unparseable SMILES
        if mol is None:
            return None
        return (smiles, QED.qed(mol))
    except Exception as e:
        logger.warning(f"Failed to calculate QED for {smiles}: {e}")
        return None
def parallel_qed_calculation(
    smiles_list: List[str],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.DataFrame:
    """
    Calculate QED values for a list of SMILES in parallel using joblib.

    Args:
        smiles_list (List[str]): List of SMILES strings
        n_jobs (int): Number of parallel jobs. -1 means using all processors
        batch_size (int or str): Batch size for parallel processing
        backend (str): Joblib backend ('loky', 'threading', 'multiprocessing')

    Returns:
        pd.DataFrame: DataFrame with 'smiles' and 'qed' columns; failed
        molecules are dropped
    """
    logger.info(f"Calculating QED values for {len(smiles_list)} molecules...")
    worker = delayed(calculate_single_qed)
    raw = Parallel(n_jobs=n_jobs, batch_size=batch_size, backend=backend)(
        worker(s) for s in smiles_list
    )
    # Each worker returns (smiles, qed) or None on failure
    kept = [pair for pair in raw if pair is not None]
    if not kept:
        logger.warning("No valid QED values calculated")
        return pd.DataFrame(columns=['smiles', 'qed'])
    df = pd.DataFrame(kept, columns=['smiles', 'qed'])
    logger.info(f"Successfully calculated QED values for {len(df)} molecules")
    return df
def calculate_qed_series(
    smiles_series: Union[List[str], pd.Series],
    n_jobs: int = -1,
    batch_size: Union[int, str] = "auto",
    backend: str = "loky"
) -> pd.Series:
    """
    Calculate QED values for a pandas Series or list of SMILES.

    Args:
        smiles_series: Series or list of SMILES strings
        n_jobs (int): Number of parallel jobs
        batch_size (int or str): Batch size for parallel processing
        backend (str): Joblib backend to use

    Returns:
        pd.Series: QED values named 'qed'; failed molecules map to None.
        Keeps the input's index when a Series is given.
    """
    if isinstance(smiles_series, pd.Series):
        index = smiles_series.index
        values = smiles_series.tolist()
    else:
        index = None
        values = smiles_series
    raw = Parallel(n_jobs=n_jobs, batch_size=batch_size, backend=backend)(
        delayed(calculate_single_qed)(s) for s in values
    )
    # Workers return (smiles, qed) or None; keep positional alignment with input
    qed_values = [None if r is None else r[1] for r in raw]
    if index is None:
        return pd.Series(qed_values, name='qed')
    return pd.Series(qed_values, index=index, name='qed')
class QEDCalculator:
    """
    Convenience wrapper around the parallel QED helpers that remembers
    the parallelism settings across calls.
    """

    def __init__(self, n_jobs: int = -1, batch_size: Union[int, str] = "auto"):
        """
        Initialize the QEDCalculator.

        Args:
            n_jobs (int): Number of parallel jobs. -1 means using all processors
            batch_size (int or str): Batch size for parallel processing
        """
        self.n_jobs = n_jobs
        self.batch_size = batch_size
        # Process-based backend; safest default for RDKit workloads
        self.backend = "loky"

    def calculate(self, smiles_list: List[str]) -> pd.DataFrame:
        """
        Calculate QED values for a list of SMILES.

        Args:
            smiles_list (List[str]): List of SMILES strings

        Returns:
            pd.DataFrame: DataFrame with 'smiles' and 'qed' columns
        """
        return parallel_qed_calculation(
            smiles_list,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend,
        )

    def calculate_series(self, smiles_series: Union[List[str], pd.Series]) -> pd.Series:
        """
        Calculate QED values for a pandas Series or list of SMILES.

        Args:
            smiles_series: Series or list of SMILES strings

        Returns:
            pd.Series: Series of QED values
        """
        return calculate_qed_series(
            smiles_series,
            n_jobs=self.n_jobs,
            batch_size=self.batch_size,
            backend=self.backend,
        )
"""
usage
from utils.qed_calculator import parallel_qed_calculation, QEDCalculator
# 方式1直接使用函数
smiles_list = ['CCO', 'CCN', 'CCC']
qed_df = parallel_qed_calculation(smiles_list, n_jobs=-1)
# 方式2使用类
calculator = QEDCalculator(n_jobs=-1)
qed_df = calculator.calculate(smiles_list)
# 方式3处理pandas Series
import pandas as pd
smiles_series = pd.Series(['CCO', 'CCN', 'CCC'], name='smiles')
qed_series = calculator.calculate_series(smiles_series)
"""