""" Batch processing script for analyzing all macrolactones in the dataset. """ import sys sys.path.append('..') import pandas as pd from pathlib import Path from tqdm import tqdm import json from rdkit import Chem from src.fragment_cleaver import process_molecule from src.fragment_dataclass import MoleculeFragments def batch_process_molecules(csv_path: str, output_base_dir: str, max_molecules: int = None): """ Process all molecules in the CSV file. Args: csv_path: Path to the CSV file containing SMILES output_base_dir: Base directory for output max_molecules: Maximum number of molecules to process (None for all) """ # Read CSV df = pd.read_csv(csv_path) print(f"Loaded {len(df)} molecules from {csv_path}") if max_molecules: df = df.head(max_molecules) print(f"Processing first {max_molecules} molecules") # Create output directory output_dir = Path(output_base_dir) output_dir.mkdir(parents=True, exist_ok=True) # Statistics successful = 0 failed = 0 failed_molecules = [] all_fragments = [] # Process each molecule for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing molecules"): smiles = row['smiles'] molecule_id = row.get('IDs', f'molecule_{idx}') try: # Process molecule mol_fragments = process_molecule(smiles, idx) if mol_fragments is None or len(mol_fragments.fragments) == 0: failed += 1 failed_molecules.append({ 'index': idx, 'id': molecule_id, 'reason': 'No fragments extracted' }) continue # Create output directory for this molecule mol_output_dir = output_dir / mol_fragments.parent_id mol_output_dir.mkdir(parents=True, exist_ok=True) # Save complete molecule fragments mol_fragments_path = mol_output_dir / f"{mol_fragments.parent_id}_all_fragments.json" mol_fragments.to_json_file(str(mol_fragments_path)) # Save individual fragments for frag in mol_fragments.fragments: frag_path = mol_output_dir / f"{frag.fragment_id}.json" frag.to_json_file(str(frag_path)) # Collect for overall statistics all_fragments.append({ 'parent_id': frag.parent_id, 'fragment_id': frag.fragment_id, 'cleavage_position': frag.cleavage_position, 'fragment_smiles': frag.fragment_smiles, 'atom_count': frag.atom_count, 'molecular_weight': frag.molecular_weight, 'parent_smiles': frag.parent_smiles }) successful += 1 except Exception as e: failed += 1 failed_molecules.append({ 'index': idx, 'id': molecule_id, 'error': str(e) }) print(f"\nError processing molecule {idx} ({molecule_id}): {e}") # Save overall statistics stats = { 'total_molecules': len(df), 'successful': successful, 'failed': failed, 'total_fragments': len(all_fragments), 'failed_molecules': failed_molecules } stats_path = output_dir / 'processing_stats.json' with open(stats_path, 'w') as f: json.dump(stats, f, indent=2) # Save all fragments as CSV for easy analysis if all_fragments: fragments_df = pd.DataFrame(all_fragments) fragments_csv_path = output_dir / 'all_fragments.csv' fragments_df.to_csv(fragments_csv_path, index=False) print(f"\n✓ Saved all fragments to: {fragments_csv_path}") # Print summary print(f"\n{'='*60}") print(f"PROCESSING COMPLETE") print(f"{'='*60}") print(f"Total molecules: {len(df)}") print(f"Successfully processed: {successful}") print(f"Failed: {failed}") print(f"Total fragments extracted: {len(all_fragments)}") print(f"{'='*60}") print(f"\nResults saved to: {output_dir}") print(f"Statistics saved to: {stats_path}") return fragments_df if all_fragments else None if __name__ == "__main__": import argparse parser = argparse.ArgumentParser( description="Batch process macrolactones to extract side chain fragments" ) parser.add_argument( '--input', type=str, default='../ring16/temp.csv', help='Input CSV file path' ) parser.add_argument( '--output', type=str, default='../output/fragments', help='Output directory path' ) parser.add_argument( '--max', type=int, default=None, help='Maximum number of molecules to process (default: all)' ) args = parser.parse_args() batch_process_molecules( csv_path=args.input, output_base_dir=args.output, max_molecules=args.max )