from rdkit import Chem from joblib import Parallel, delayed import logging from collections import Counter # 定义日志配置 logging.basicConfig( filename="rgroup_matching.log", level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", ) # 定义 SMARTS 模式 macro = Chem.MolFromSmarts("[r12,r13,r14,r15,r16,r17,r18,r19,r20]([#8][#6](=[#8]))") # 读取 SMI 文件 smi_file = "/home/mambauser/LillyMol/test/1M_stratsampled_V1B.smi" with open(smi_file, 'r') as f: SMILES_list = [line.strip() for line in f if line.strip()] logging.info(f"Loaded {len(SMILES_list)} molecules from {smi_file}.") # 匹配和最大环统计函数 def match_and_ring_analysis(smiles): mol = Chem.MolFromSmiles(smiles) if mol is None: return smiles, None, 0 # 无效分子 result = mol.GetSubstructMatches(macro) ri = mol.GetRingInfo() largest_ring_size = max((len(r) for r in ri.AtomRings()), default=0) return smiles, result, largest_ring_size # 使用 joblib 并行处理 logging.info("Starting SMARTS matching and ring analysis...") results = Parallel(n_jobs=-1)(delayed(match_and_ring_analysis)(s) for s in SMILES_list) # 分离成功、失败的分子以及最大环大小 success = [smiles for smiles, result, _ in results if result] fail = [smiles for smiles, result, _ in results if not result] ring_sizes = [largest_ring_size for _, _, largest_ring_size in results] # 统计最大环频数 ring_size_counter = Counter(ring_sizes) # 统计结果 total = len(success) + len(fail) success_rate = len(success) / total * 100 if total > 0 else 0 # 保存日志信息 logging.info(f"Total molecules: {total}") logging.info(f"Success: {len(success)}") logging.info(f"Fail: {len(fail)}") logging.info(f"Success rate: {success_rate:.2f}%") logging.info(f"Ring size distribution: {dict(ring_size_counter)}") print(f"Total molecules: {total}") print(f"Success: {len(success)}") print(f"Fail: {len(fail)}") print(f"Success rate: {success_rate:.2f}%") print("Ring size distribution:") for size, count in sorted(ring_size_counter.items()): print(f" Ring size {size}: {count}") # 将失败的分子写入到一个 SMI 文件 fail_smi_file = "fail_molecules.smi" with open(fail_smi_file, "w") as ff: for smiles in fail: ff.write(smiles + "\n") logging.info(f"Failed molecules written to {fail_smi_file}.") print(f"Failed molecules written to {fail_smi_file}.") # 将环大小分布写入文件 ring_size_file = "ring_size_distribution.txt" with open(ring_size_file, "w") as rf: rf.write("Ring Size\tCount\n") for size, count in sorted(ring_size_counter.items()): rf.write(f"{size}\t{count}\n") logging.info(f"Ring size distribution written to {ring_size_file}.") print(f"Ring size distribution written to {ring_size_file}.")