Files
LillyMol/utils/ring_analysis.py
mm644706215 64d80d2e3c first add
2025-09-17 22:14:05 +08:00

84 lines
2.8 KiB
Python
Executable File

from rdkit import Chem
from joblib import Parallel, delayed
import logging
from collections import Counter
# 定义日志配置
logging.basicConfig(
filename="rgroup_matching.log",
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
# 定义 SMARTS 模式
macro = Chem.MolFromSmarts("[r12,r13,r14,r15,r16,r17,r18,r19,r20]([#8][#6](=[#8]))")
# 读取 SMI 文件
smi_file = "/home/mambauser/LillyMol/test/1M_stratsampled_V1B.smi"
with open(smi_file, 'r') as f:
SMILES_list = [line.strip() for line in f if line.strip()]
logging.info(f"Loaded {len(SMILES_list)} molecules from {smi_file}.")
# 匹配和最大环统计函数
def match_and_ring_analysis(smiles):
mol = Chem.MolFromSmiles(smiles)
if mol is None:
return smiles, None, 0 # 无效分子
result = mol.GetSubstructMatches(macro)
ri = mol.GetRingInfo()
largest_ring_size = max((len(r) for r in ri.AtomRings()), default=0)
return smiles, result, largest_ring_size
# 使用 joblib 并行处理
logging.info("Starting SMARTS matching and ring analysis...")
results = Parallel(n_jobs=-1)(delayed(match_and_ring_analysis)(s) for s in SMILES_list)
# 分离成功、失败的分子以及最大环大小
success = [smiles for smiles, result, _ in results if result]
fail = [smiles for smiles, result, _ in results if not result]
ring_sizes = [largest_ring_size for _, _, largest_ring_size in results]
# 统计最大环频数
ring_size_counter = Counter(ring_sizes)
# 统计结果
total = len(success) + len(fail)
success_rate = len(success) / total * 100 if total > 0 else 0
# 保存日志信息
logging.info(f"Total molecules: {total}")
logging.info(f"Success: {len(success)}")
logging.info(f"Fail: {len(fail)}")
logging.info(f"Success rate: {success_rate:.2f}%")
logging.info(f"Ring size distribution: {dict(ring_size_counter)}")
print(f"Total molecules: {total}")
print(f"Success: {len(success)}")
print(f"Fail: {len(fail)}")
print(f"Success rate: {success_rate:.2f}%")
print("Ring size distribution:")
for size, count in sorted(ring_size_counter.items()):
print(f" Ring size {size}: {count}")
# 将失败的分子写入到一个 SMI 文件
fail_smi_file = "fail_molecules.smi"
with open(fail_smi_file, "w") as ff:
for smiles in fail:
ff.write(smiles + "\n")
logging.info(f"Failed molecules written to {fail_smi_file}.")
print(f"Failed molecules written to {fail_smi_file}.")
# 将环大小分布写入文件
ring_size_file = "ring_size_distribution.txt"
with open(ring_size_file, "w") as rf:
rf.write("Ring Size\tCount\n")
for size, count in sorted(ring_size_counter.items()):
rf.write(f"{size}\t{count}\n")
logging.info(f"Ring size distribution written to {ring_size_file}.")
print(f"Ring size distribution written to {ring_size_file}.")