497 lines
18 KiB
Python
Executable File
497 lines
18 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
碎片分析脚本
|
||
分析指定环大小的分子碎片:
|
||
- 统计每个位置的碎片频率
|
||
- 分析碎片理化性质
|
||
- 找出裂解最多的位置(前5)
|
||
- 找出变化最大的位置
|
||
"""
|
||
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
# 添加项目根目录到 Python 路径
|
||
script_dir = Path(__file__).resolve().parent
|
||
project_root = script_dir.parent
|
||
sys.path.insert(0, str(project_root))
|
||
|
||
import json
|
||
import pandas as pd
|
||
import numpy as np
|
||
from collections import Counter, defaultdict
|
||
from rdkit import Chem
|
||
from rdkit.Chem import Descriptors, rdMolDescriptors
|
||
from rdkit import DataStructs
|
||
from rdkit.Chem import AllChem
|
||
import matplotlib.pyplot as plt
|
||
import seaborn as sns
|
||
from scipy import stats as scipy_stats
|
||
|
||
|
||
def load_fragments_from_ring_size(ring_size):
|
||
"""加载指定环大小的所有碎片"""
|
||
output_dir = Path(f'output/ring{ring_size}_fragments')
|
||
|
||
if not output_dir.exists():
|
||
print(f"❌ 未找到 {ring_size}元环的输出文件夹")
|
||
return None
|
||
|
||
all_fragments = []
|
||
|
||
for mol_dir in output_dir.iterdir():
|
||
if mol_dir.is_dir() and mol_dir.name.startswith(f'ring{ring_size}_mol_'):
|
||
fragments_file = mol_dir / f"{mol_dir.name}_all_fragments.json"
|
||
if fragments_file.exists():
|
||
with open(fragments_file, 'r', encoding='utf-8') as f:
|
||
data = json.load(f)
|
||
all_fragments.extend(data['fragments'])
|
||
|
||
return all_fragments
|
||
|
||
|
||
def calculate_fragment_properties(smiles):
|
||
"""计算碎片的理化性质"""
|
||
mol = Chem.MolFromSmiles(smiles)
|
||
if mol is None:
|
||
return None
|
||
|
||
properties = {
|
||
'smiles': smiles,
|
||
'molecular_weight': Descriptors.MolWt(mol),
|
||
'num_atoms': mol.GetNumAtoms(),
|
||
'num_heavy_atoms': mol.GetNumHeavyAtoms(),
|
||
'num_rotatable_bonds': Descriptors.NumRotatableBonds(mol),
|
||
'num_hbd': Descriptors.NumHDonors(mol), # 氢键供体
|
||
'num_hba': Descriptors.NumHAcceptors(mol), # 氢键受体
|
||
'logp': Descriptors.MolLogP(mol),
|
||
'tpsa': Descriptors.TPSA(mol), # 极性表面积
|
||
'num_rings': Descriptors.RingCount(mol),
|
||
'num_aromatic_rings': Descriptors.NumAromaticRings(mol),
|
||
}
|
||
|
||
return properties
|
||
|
||
|
||
def calculate_shannon_entropy(fragment_list):
|
||
"""
|
||
计算香农熵,用于衡量碎片多样性
|
||
熵越高,表示多样性越大
|
||
"""
|
||
if not fragment_list:
|
||
return 0
|
||
|
||
counts = Counter(fragment_list)
|
||
total = len(fragment_list)
|
||
|
||
entropy = 0
|
||
for count in counts.values():
|
||
p = count / total
|
||
if p > 0:
|
||
entropy -= p * np.log2(p)
|
||
|
||
return entropy
|
||
|
||
|
||
def calculate_structural_diversity(smiles_list):
|
||
"""
|
||
计算结构多样性
|
||
使用Tanimoto相似度的平均值
|
||
相似度越低,多样性越高
|
||
"""
|
||
if len(smiles_list) < 2:
|
||
return 0
|
||
|
||
# 生成指纹
|
||
fps = []
|
||
for smiles in smiles_list:
|
||
mol = Chem.MolFromSmiles(smiles)
|
||
if mol:
|
||
fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
|
||
fps.append(fp)
|
||
|
||
if len(fps) < 2:
|
||
return 0
|
||
|
||
# 计算所有配对的Tanimoto相似度
|
||
similarities = []
|
||
for i in range(len(fps)):
|
||
for j in range(i+1, len(fps)):
|
||
sim = DataStructs.TanimotoSimilarity(fps[i], fps[j])
|
||
similarities.append(sim)
|
||
|
||
# 返回平均相似度(1-相似度 = 多样性)
|
||
avg_similarity = np.mean(similarities)
|
||
diversity = 1 - avg_similarity
|
||
|
||
return diversity
|
||
|
||
|
||
def analyze_position_statistics(fragments, ring_size):
|
||
"""分析每个位置的统计信息"""
|
||
|
||
# 按位置分组
|
||
position_data = defaultdict(list)
|
||
for frag in fragments:
|
||
pos = frag['cleavage_position']
|
||
position_data[pos].append(frag)
|
||
|
||
# 统计信息
|
||
results = []
|
||
|
||
for pos in sorted(position_data.keys()):
|
||
frags = position_data[pos]
|
||
smiles_list = [f['fragment_smiles'] for f in frags]
|
||
|
||
# 基础统计
|
||
total_count = len(smiles_list)
|
||
unique_count = len(set(smiles_list))
|
||
most_common = Counter(smiles_list).most_common(3)
|
||
|
||
# 多样性指标
|
||
shannon_entropy = calculate_shannon_entropy(smiles_list)
|
||
structural_diversity = calculate_structural_diversity(list(set(smiles_list)))
|
||
|
||
# 理化性质统计
|
||
props_list = []
|
||
for smiles in set(smiles_list):
|
||
props = calculate_fragment_properties(smiles)
|
||
if props:
|
||
props_list.append(props)
|
||
|
||
if props_list:
|
||
mw_mean = np.mean([p['molecular_weight'] for p in props_list])
|
||
mw_std = np.std([p['molecular_weight'] for p in props_list])
|
||
atom_mean = np.mean([p['num_atoms'] for p in props_list])
|
||
atom_std = np.std([p['num_atoms'] for p in props_list])
|
||
else:
|
||
mw_mean = mw_std = atom_mean = atom_std = 0
|
||
|
||
results.append({
|
||
'position': pos,
|
||
'total_count': total_count,
|
||
'unique_count': unique_count,
|
||
'uniqueness_ratio': unique_count / total_count if total_count > 0 else 0,
|
||
'shannon_entropy': shannon_entropy,
|
||
'structural_diversity': structural_diversity,
|
||
'mw_mean': mw_mean,
|
||
'mw_std': mw_std,
|
||
'atom_mean': atom_mean,
|
||
'atom_std': atom_std,
|
||
'most_common': most_common
|
||
})
|
||
|
||
return pd.DataFrame(results)
|
||
|
||
|
||
def plot_position_analysis(df_stats, ring_size, output_dir):
|
||
"""绘制位置分析图表"""
|
||
|
||
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
|
||
fig.suptitle(f'{ring_size}元环碎片位置分析', fontsize=16, fontweight='bold')
|
||
|
||
positions = df_stats['position'].values
|
||
|
||
# 1. 碎片总数分布
|
||
ax = axes[0, 0]
|
||
bars = ax.bar(positions, df_stats['total_count'], color='steelblue', edgecolor='black', alpha=0.7)
|
||
ax.set_xlabel('环位置', fontsize=12)
|
||
ax.set_ylabel('碎片总数', fontsize=12)
|
||
ax.set_title('各位置碎片总数', fontsize=14, fontweight='bold')
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
# 标注前5
|
||
top5_indices = df_stats.nlargest(5, 'total_count').index
|
||
for idx in top5_indices:
|
||
pos = df_stats.loc[idx, 'position']
|
||
count = df_stats.loc[idx, 'total_count']
|
||
ax.text(pos, count, str(int(count)), ha='center', va='bottom', fontsize=9, fontweight='bold')
|
||
|
||
# 2. 独特碎片数
|
||
ax = axes[0, 1]
|
||
ax.bar(positions, df_stats['unique_count'], color='coral', edgecolor='black', alpha=0.7)
|
||
ax.set_xlabel('环位置', fontsize=12)
|
||
ax.set_ylabel('独特碎片数', fontsize=12)
|
||
ax.set_title('各位置独特碎片数', fontsize=14, fontweight='bold')
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
# 3. 香农熵(多样性)
|
||
ax = axes[0, 2]
|
||
bars = ax.bar(positions, df_stats['shannon_entropy'], color='mediumseagreen', edgecolor='black', alpha=0.7)
|
||
ax.set_xlabel('环位置', fontsize=12)
|
||
ax.set_ylabel('香农熵', fontsize=12)
|
||
ax.set_title('各位置碎片多样性(香农熵)', fontsize=14, fontweight='bold')
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
# 标注熵最高的位置
|
||
top_entropy_idx = df_stats['shannon_entropy'].idxmax()
|
||
pos = df_stats.loc[top_entropy_idx, 'position']
|
||
entropy = df_stats.loc[top_entropy_idx, 'shannon_entropy']
|
||
ax.text(pos, entropy, f'{entropy:.2f}', ha='center', va='bottom', fontsize=9, fontweight='bold')
|
||
|
||
# 4. 结构多样性
|
||
ax = axes[1, 0]
|
||
ax.bar(positions, df_stats['structural_diversity'], color='mediumpurple', edgecolor='black', alpha=0.7)
|
||
ax.set_xlabel('环位置', fontsize=12)
|
||
ax.set_ylabel('结构多样性', fontsize=12)
|
||
ax.set_title('各位置结构多样性(指纹相似度)', fontsize=14, fontweight='bold')
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
# 5. 分子量分布
|
||
ax = axes[1, 1]
|
||
ax.errorbar(positions, df_stats['mw_mean'], yerr=df_stats['mw_std'],
|
||
fmt='o-', color='darkred', ecolor='lightcoral', capsize=5, capthick=2, alpha=0.7)
|
||
ax.set_xlabel('环位置', fontsize=12)
|
||
ax.set_ylabel('分子量 (Da)', fontsize=12)
|
||
ax.set_title('各位置碎片平均分子量', fontsize=14, fontweight='bold')
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
# 6. 原子数分布
|
||
ax = axes[1, 2]
|
||
ax.errorbar(positions, df_stats['atom_mean'], yerr=df_stats['atom_std'],
|
||
fmt='s-', color='darkgreen', ecolor='lightgreen', capsize=5, capthick=2, alpha=0.7)
|
||
ax.set_xlabel('环位置', fontsize=12)
|
||
ax.set_ylabel('原子数', fontsize=12)
|
||
ax.set_title('各位置碎片平均原子数', fontsize=14, fontweight='bold')
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
plt.tight_layout()
|
||
|
||
# 保存图表
|
||
plot_path = output_dir / f'ring{ring_size}_position_analysis.png'
|
||
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
|
||
print(f"✓ 图表已保存: {plot_path}")
|
||
|
||
plt.show()
|
||
|
||
|
||
def plot_fragment_properties_distribution(fragments, ring_size, output_dir):
|
||
"""绘制碎片理化性质分布"""
|
||
|
||
# 计算所有碎片的性质
|
||
all_props = []
|
||
unique_smiles = set([f['fragment_smiles'] for f in fragments])
|
||
|
||
for smiles in unique_smiles:
|
||
props = calculate_fragment_properties(smiles)
|
||
if props:
|
||
all_props.append(props)
|
||
|
||
df_props = pd.DataFrame(all_props)
|
||
|
||
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
|
||
fig.suptitle(f'{ring_size}元环碎片理化性质分布', fontsize=16, fontweight='bold')
|
||
|
||
# 1. 分子量分布
|
||
ax = axes[0, 0]
|
||
ax.hist(df_props['molecular_weight'], bins=30, color='steelblue', edgecolor='black', alpha=0.7)
|
||
ax.set_xlabel('分子量 (Da)', fontsize=12)
|
||
ax.set_ylabel('频数', fontsize=12)
|
||
ax.set_title('分子量分布', fontsize=14, fontweight='bold')
|
||
ax.axvline(df_props['molecular_weight'].median(), color='red', linestyle='--',
|
||
label=f'中位数: {df_props["molecular_weight"].median():.1f}')
|
||
ax.legend()
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
# 2. 原子数分布
|
||
ax = axes[0, 1]
|
||
ax.hist(df_props['num_atoms'], bins=range(1, int(df_props['num_atoms'].max())+2),
|
||
color='coral', edgecolor='black', alpha=0.7)
|
||
ax.set_xlabel('原子数', fontsize=12)
|
||
ax.set_ylabel('频数', fontsize=12)
|
||
ax.set_title('原子数分布', fontsize=14, fontweight='bold')
|
||
ax.axvline(df_props['num_atoms'].median(), color='red', linestyle='--',
|
||
label=f'中位数: {df_props["num_atoms"].median():.0f}')
|
||
ax.legend()
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
# 3. LogP分布
|
||
ax = axes[0, 2]
|
||
ax.hist(df_props['logp'], bins=30, color='mediumseagreen', edgecolor='black', alpha=0.7)
|
||
ax.set_xlabel('LogP', fontsize=12)
|
||
ax.set_ylabel('频数', fontsize=12)
|
||
ax.set_title('LogP分布(亲脂性)', fontsize=14, fontweight='bold')
|
||
ax.axvline(df_props['logp'].median(), color='red', linestyle='--',
|
||
label=f'中位数: {df_props["logp"].median():.2f}')
|
||
ax.legend()
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
# 4. TPSA分布
|
||
ax = axes[1, 0]
|
||
ax.hist(df_props['tpsa'], bins=30, color='mediumpurple', edgecolor='black', alpha=0.7)
|
||
ax.set_xlabel('TPSA (Ų)', fontsize=12)
|
||
ax.set_ylabel('频数', fontsize=12)
|
||
ax.set_title('极性表面积分布', fontsize=14, fontweight='bold')
|
||
ax.axvline(df_props['tpsa'].median(), color='red', linestyle='--',
|
||
label=f'中位数: {df_props["tpsa"].median():.1f}')
|
||
ax.legend()
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
# 5. 可旋转键数分布
|
||
ax = axes[1, 1]
|
||
ax.hist(df_props['num_rotatable_bonds'],
|
||
bins=range(int(df_props['num_rotatable_bonds'].max())+2),
|
||
color='darkgoldenrod', edgecolor='black', alpha=0.7)
|
||
ax.set_xlabel('可旋转键数', fontsize=12)
|
||
ax.set_ylabel('频数', fontsize=12)
|
||
ax.set_title('可旋转键数分布(柔性)', fontsize=14, fontweight='bold')
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
# 6. 氢键统计
|
||
ax = axes[1, 2]
|
||
width = 0.35
|
||
x = np.arange(2)
|
||
hbd_mean = df_props['num_hbd'].mean()
|
||
hba_mean = df_props['num_hba'].mean()
|
||
ax.bar(x[0]-width/2, hbd_mean, width, label='氢键供体 (HBD)', color='lightcoral', edgecolor='black')
|
||
ax.bar(x[1]+width/2, hba_mean, width, label='氢键受体 (HBA)', color='lightblue', edgecolor='black')
|
||
ax.set_ylabel('平均数量', fontsize=12)
|
||
ax.set_title('氢键能力统计', fontsize=14, fontweight='bold')
|
||
ax.set_xticks(x)
|
||
ax.set_xticklabels(['HBD', 'HBA'])
|
||
ax.legend()
|
||
ax.grid(axis='y', alpha=0.3)
|
||
|
||
plt.tight_layout()
|
||
|
||
# 保存图表
|
||
plot_path = output_dir / f'ring{ring_size}_properties_distribution.png'
|
||
plt.savefig(plot_path, dpi=300, bbox_inches='tight')
|
||
print(f"✓ 图表已保存: {plot_path}")
|
||
|
||
plt.show()
|
||
|
||
return df_props
|
||
|
||
|
||
def generate_analysis_report(ring_size):
|
||
"""生成完整的分析报告"""
|
||
|
||
print("="*80)
|
||
print(f"{ring_size}元环碎片分析")
|
||
print("="*80)
|
||
print()
|
||
|
||
# 加载碎片
|
||
print("加载碎片数据...")
|
||
fragments = load_fragments_from_ring_size(ring_size)
|
||
|
||
if fragments is None or len(fragments) == 0:
|
||
print(f"❌ 没有找到碎片数据")
|
||
return
|
||
|
||
print(f"✓ 加载了 {len(fragments)} 个碎片")
|
||
print()
|
||
|
||
# 创建输出文件夹
|
||
output_dir = Path(f'output/ring{ring_size}_analysis')
|
||
output_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
# 1. 位置统计分析
|
||
print("分析各位置统计信息...")
|
||
df_stats = analyze_position_statistics(fragments, ring_size)
|
||
|
||
# 保存统计表格
|
||
stats_csv = output_dir / f'ring{ring_size}_position_statistics.csv'
|
||
df_stats.to_csv(stats_csv, index=False, encoding='utf-8')
|
||
print(f"✓ 统计表格已保存: {stats_csv}")
|
||
print()
|
||
|
||
# 2. 打印关键结果
|
||
print("="*80)
|
||
print("关键发现")
|
||
print("="*80)
|
||
print()
|
||
|
||
# 裂解最多的前5个位置
|
||
top5_positions = df_stats.nlargest(5, 'total_count')
|
||
print("📊 裂解最多的前5个位置:")
|
||
for i, row in enumerate(top5_positions.itertuples(), 1):
|
||
print(f" {i}. 位置 {row.position}: {row.total_count} 个碎片 "
|
||
f"({row.unique_count} 种独特结构)")
|
||
print()
|
||
|
||
# 多样性最大的前5个位置(基于香农熵)
|
||
top5_diversity = df_stats.nlargest(5, 'shannon_entropy')
|
||
print("🌈 多样性最大的前5个位置(香农熵):")
|
||
for i, row in enumerate(top5_diversity.itertuples(), 1):
|
||
print(f" {i}. 位置 {row.position}: 熵={row.shannon_entropy:.3f}, "
|
||
f"独特比={row.uniqueness_ratio:.2%}, "
|
||
f"{row.unique_count}/{row.total_count} 独特")
|
||
print()
|
||
|
||
# 结构多样性最大的前5个位置
|
||
top5_structural = df_stats.nlargest(5, 'structural_diversity')
|
||
print("🔬 结构多样性最大的前5个位置(指纹相似度):")
|
||
for i, row in enumerate(top5_structural.itertuples(), 1):
|
||
print(f" {i}. 位置 {row.position}: 多样性={row.structural_diversity:.3f}, "
|
||
f"{row.unique_count} 种结构")
|
||
print()
|
||
|
||
# 碎片大小变化最大的位置
|
||
top5_mw_variance = df_stats.nlargest(5, 'mw_std')
|
||
print("⚖️ 分子量变化最大的前5个位置:")
|
||
for i, row in enumerate(top5_mw_variance.itertuples(), 1):
|
||
print(f" {i}. 位置 {row.position}: 平均={row.mw_mean:.1f} Da, "
|
||
f"标准差={row.mw_std:.1f} Da")
|
||
print()
|
||
|
||
# 3. 绘制图表
|
||
print("生成可视化图表...")
|
||
plot_position_analysis(df_stats, ring_size, output_dir)
|
||
print()
|
||
|
||
# 4. 理化性质分析
|
||
print("分析碎片理化性质...")
|
||
df_props = plot_fragment_properties_distribution(fragments, ring_size, output_dir)
|
||
|
||
# 保存理化性质表格
|
||
props_csv = output_dir / f'ring{ring_size}_fragment_properties.csv'
|
||
df_props.to_csv(props_csv, index=False, encoding='utf-8')
|
||
print(f"✓ 理化性质表格已保存: {props_csv}")
|
||
print()
|
||
|
||
# 5. 总结统计
|
||
print("="*80)
|
||
print("总体统计")
|
||
print("="*80)
|
||
print(f"总碎片数: {len(fragments)}")
|
||
print(f"独特碎片数: {len(set([f['fragment_smiles'] for f in fragments]))}")
|
||
print(f"平均分子量: {df_props['molecular_weight'].mean():.2f} ± {df_props['molecular_weight'].std():.2f} Da")
|
||
print(f"平均原子数: {df_props['num_atoms'].mean():.2f} ± {df_props['num_atoms'].std():.2f}")
|
||
print(f"分子量范围: {df_props['molecular_weight'].min():.1f} - {df_props['molecular_weight'].max():.1f} Da")
|
||
print(f"原子数范围: {int(df_props['num_atoms'].min())} - {int(df_props['num_atoms'].max())}")
|
||
print()
|
||
|
||
# 6. 生成文本报告
|
||
report_path = output_dir / f'ring{ring_size}_analysis_report.txt'
|
||
with open(report_path, 'w', encoding='utf-8') as f:
|
||
f.write(f"{ring_size}元环碎片分析报告\n")
|
||
f.write("="*80 + "\n\n")
|
||
f.write(f"生成时间: {pd.Timestamp.now()}\n\n")
|
||
|
||
f.write("位置统计摘要:\n")
|
||
f.write(df_stats.to_string())
|
||
f.write("\n\n")
|
||
|
||
f.write("理化性质摘要:\n")
|
||
f.write(df_props.describe().to_string())
|
||
f.write("\n")
|
||
|
||
print(f"✓ 完整报告已保存: {report_path}")
|
||
print()
|
||
print("="*80)
|
||
print("分析完成!")
|
||
print("="*80)
|
||
|
||
|
||
if __name__ == '__main__':
|
||
import argparse
|
||
|
||
parser = argparse.ArgumentParser(description='分析大环内酯碎片')
|
||
parser.add_argument('ring_size', type=int, help='环大小(如:16)')
|
||
|
||
args = parser.parse_args()
|
||
|
||
generate_analysis_report(args.ring_size)
|