feat(toolkit): ship macro_lactone_toolkit package

Unify macrolactone detection, numbering, fragmentation, and splicing under the installable macro_lactone_toolkit package.

- replace legacy src.* modules with the new package layout
- add analyze/number/fragment CLI entrypoints and pixi tasks
- migrate tests, README, and scripts to the new package API
2026-03-18 22:06:45 +08:00
parent a768d26e47
commit 5e7b236f31
45 changed files with 1302 additions and 6304 deletions
--- a/scripts/batch_process.py
+++ b/scripts/batch_process.py
@@ -1,165 +1,10 @@
-"""
-Batch processing script for analyzing all macrolactones in the dataset.
-"""
+from __future__ import annotations
+
 import sys
-sys.path.append('..')

-import pandas as pd
-from pathlib import Path
-from tqdm import tqdm
-import json
-from rdkit import Chem
-
-from src.fragment_cleaver import process_molecule
-from src.fragment_dataclass import MoleculeFragments
-
-
-def batch_process_molecules(csv_path: str, output_base_dir: str, 
-                            max_molecules: int = None):
-    """
-    Process all molecules in the CSV file.
-    
-    Args:
-        csv_path: Path to the CSV file containing SMILES
-        output_base_dir: Base directory for output
-        max_molecules: Maximum number of molecules to process (None for all)
-    """
-    # Read CSV
-    df = pd.read_csv(csv_path)
-    print(f"Loaded {len(df)} molecules from {csv_path}")
-    
-    if max_molecules:
-        df = df.head(max_molecules)
-        print(f"Processing first {max_molecules} molecules")
-    
-    # Create output directory
-    output_dir = Path(output_base_dir)
-    output_dir.mkdir(parents=True, exist_ok=True)
-    
-    # Statistics
-    successful = 0
-    failed = 0
-    failed_molecules = []
-    all_fragments = []
-    
-    # Process each molecule
-    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing molecules"):
-        smiles = row['smiles']
-        molecule_id = row.get('IDs', f'molecule_{idx}')
-        
-        try:
-            # Process molecule
-            mol_fragments = process_molecule(smiles, idx)
-            
-            if mol_fragments is None or len(mol_fragments.fragments) == 0:
-                failed += 1
-                failed_molecules.append({
-                    'index': idx,
-                    'id': molecule_id,
-                    'reason': 'No fragments extracted'
-                })
-                continue
-            
-            # Create output directory for this molecule
-            mol_output_dir = output_dir / mol_fragments.parent_id
-            mol_output_dir.mkdir(parents=True, exist_ok=True)
-            
-            # Save complete molecule fragments
-            mol_fragments_path = mol_output_dir / f"{mol_fragments.parent_id}_all_fragments.json"
-            mol_fragments.to_json_file(str(mol_fragments_path))
-            
-            # Save individual fragments
-            for frag in mol_fragments.fragments:
-                frag_path = mol_output_dir / f"{frag.fragment_id}.json"
-                frag.to_json_file(str(frag_path))
-                
-                # Collect for overall statistics
-                all_fragments.append({
-                    'parent_id': frag.parent_id,
-                    'fragment_id': frag.fragment_id,
-                    'cleavage_position': frag.cleavage_position,
-                    'fragment_smiles': frag.fragment_smiles,
-                    'atom_count': frag.atom_count,
-                    'molecular_weight': frag.molecular_weight,
-                    'parent_smiles': frag.parent_smiles
-                })
-            
-            successful += 1
-            
-        except Exception as e:
-            failed += 1
-            failed_molecules.append({
-                'index': idx,
-                'id': molecule_id,
-                'error': str(e)
-            })
-            print(f"\nError processing molecule {idx} ({molecule_id}): {e}")
-    
-    # Save overall statistics
-    stats = {
-        'total_molecules': len(df),
-        'successful': successful,
-        'failed': failed,
-        'total_fragments': len(all_fragments),
-        'failed_molecules': failed_molecules
-    }
-    
-    stats_path = output_dir / 'processing_stats.json'
-    with open(stats_path, 'w') as f:
-        json.dump(stats, f, indent=2)
-    
-    # Save all fragments as CSV for easy analysis
-    if all_fragments:
-        fragments_df = pd.DataFrame(all_fragments)
-        fragments_csv_path = output_dir / 'all_fragments.csv'
-        fragments_df.to_csv(fragments_csv_path, index=False)
-        print(f"\n✓ Saved all fragments to: {fragments_csv_path}")
-    
-    # Print summary
-    print(f"\n{'='*60}")
-    print(f"PROCESSING COMPLETE")
-    print(f"{'='*60}")
-    print(f"Total molecules: {len(df)}")
-    print(f"Successfully processed: {successful}")
-    print(f"Failed: {failed}")
-    print(f"Total fragments extracted: {len(all_fragments)}")
-    print(f"{'='*60}")
-    print(f"\nResults saved to: {output_dir}")
-    print(f"Statistics saved to: {stats_path}")
-    
-    return fragments_df if all_fragments else None
+from macro_lactone_toolkit.cli import main


 if __name__ == "__main__":
-    import argparse
-    
-    parser = argparse.ArgumentParser(
-        description="Batch process macrolactones to extract side chain fragments"
-    )
-    parser.add_argument(
-        '--input',
-        type=str,
-        default='../ring16/temp.csv',
-        help='Input CSV file path'
-    )
-    parser.add_argument(
-        '--output',
-        type=str,
-        default='../output/fragments',
-        help='Output directory path'
-    )
-    parser.add_argument(
-        '--max',
-        type=int,
-        default=None,
-        help='Maximum number of molecules to process (default: all)'
-    )
-    
-    args = parser.parse_args()
-    
-    batch_process_molecules(
-        csv_path=args.input,
-        output_base_dir=args.output,
-        max_molecules=args.max
-    )
-
+    sys.argv = ["macro-lactone-toolkit", "fragment", *sys.argv[1:]]
+    main()