feat(toolkit): ship macro_lactone_toolkit package

Unify macrolactone detection, numbering, fragmentation, and
splicing under the installable macro_lactone_toolkit package.

- replace legacy src.* modules with the new package layout
- add analyze/number/fragment CLI entrypoints and pixi tasks
- migrate tests, README, and scripts to the new package API
This commit is contained in:
2026-03-18 22:06:45 +08:00
parent a768d26e47
commit 5e7b236f31
45 changed files with 1302 additions and 6304 deletions

View File

@@ -1,165 +1,10 @@
"""
Batch processing script for analyzing all macrolactones in the dataset.
"""
from __future__ import annotations
import sys
sys.path.append('..')
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import json
from rdkit import Chem
from src.fragment_cleaver import process_molecule
from src.fragment_dataclass import MoleculeFragments
def batch_process_molecules(csv_path: str, output_base_dir: str,
                            max_molecules: int | None = None) -> pd.DataFrame | None:
    """
    Process the molecules listed in a CSV file and write their fragments to disk.

    Each row's ``smiles`` value is passed to ``process_molecule``. For every
    molecule that yields at least one fragment, a sub-directory named after
    ``parent_id`` is created under ``output_base_dir`` containing
    ``<parent_id>_all_fragments.json`` plus one ``<fragment_id>.json`` per
    fragment. After the loop, ``processing_stats.json`` and (when any fragment
    was collected) ``all_fragments.csv`` are written to ``output_base_dir``.

    Args:
        csv_path: Path to the CSV file containing SMILES (column ``smiles``;
            an optional ``IDs`` column is used to label failures).
        output_base_dir: Base directory for output; created if missing.
        max_molecules: Maximum number of molecules to process (None for all).
            The value is forwarded to ``DataFrame.head``, so ``0`` selects no
            rows.

    Returns:
        A DataFrame with one row per extracted fragment, or None when no
        fragment was extracted.
    """
    # Read CSV
    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} molecules from {csv_path}")

    # Compare against None rather than relying on truthiness: a limit of 0 is
    # a real limit and must not fall through to "process everything".
    if max_molecules is not None:
        df = df.head(max_molecules)
        print(f"Processing first {max_molecules} molecules")

    # Create output directory
    output_dir = Path(output_base_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Statistics
    successful = 0
    failed = 0
    failed_molecules = []
    all_fragments = []

    # Process each molecule
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing molecules"):
        smiles = row['smiles']
        molecule_id = row.get('IDs', f'molecule_{idx}')

        # One bad molecule must not abort the whole batch, so every failure is
        # recorded and the loop moves on to the next row.
        try:
            # Process molecule
            mol_fragments = process_molecule(smiles, idx)

            if mol_fragments is None or len(mol_fragments.fragments) == 0:
                failed += 1
                failed_molecules.append({
                    'index': idx,
                    'id': molecule_id,
                    'reason': 'No fragments extracted'
                })
                continue

            # Create output directory for this molecule
            mol_output_dir = output_dir / mol_fragments.parent_id
            mol_output_dir.mkdir(parents=True, exist_ok=True)

            # Save complete molecule fragments
            mol_fragments_path = mol_output_dir / f"{mol_fragments.parent_id}_all_fragments.json"
            mol_fragments.to_json_file(str(mol_fragments_path))

            # Save individual fragments
            for frag in mol_fragments.fragments:
                frag_path = mol_output_dir / f"{frag.fragment_id}.json"
                frag.to_json_file(str(frag_path))

                # Collect for overall statistics
                all_fragments.append({
                    'parent_id': frag.parent_id,
                    'fragment_id': frag.fragment_id,
                    'cleavage_position': frag.cleavage_position,
                    'fragment_smiles': frag.fragment_smiles,
                    'atom_count': frag.atom_count,
                    'molecular_weight': frag.molecular_weight,
                    'parent_smiles': frag.parent_smiles
                })

            successful += 1

        except Exception as e:
            failed += 1
            failed_molecules.append({
                'index': idx,
                'id': molecule_id,
                'error': str(e)
            })
            print(f"\nError processing molecule {idx} ({molecule_id}): {e}")

    # Save overall statistics
    stats = {
        'total_molecules': len(df),
        'successful': successful,
        'failed': failed,
        'total_fragments': len(all_fragments),
        'failed_molecules': failed_molecules
    }
    stats_path = output_dir / 'processing_stats.json'
    with open(stats_path, 'w') as f:
        json.dump(stats, f, indent=2)

    # Save all fragments as CSV for easy analysis
    fragments_df = None
    if all_fragments:
        fragments_df = pd.DataFrame(all_fragments)
        fragments_csv_path = output_dir / 'all_fragments.csv'
        fragments_df.to_csv(fragments_csv_path, index=False)
        print(f"\n✓ Saved all fragments to: {fragments_csv_path}")

    # Print summary
    print(f"\n{'='*60}")
    print("PROCESSING COMPLETE")
    print(f"{'='*60}")
    print(f"Total molecules: {len(df)}")
    print(f"Successfully processed: {successful}")
    print(f"Failed: {failed}")
    print(f"Total fragments extracted: {len(all_fragments)}")
    print(f"{'='*60}")
    print(f"\nResults saved to: {output_dir}")
    print(f"Statistics saved to: {stats_path}")

    return fragments_df
from macro_lactone_toolkit.cli import main
# Script entry point.
# NOTE(review): this span comes from a flattened diff view (the hunk header
# reads -1,165 +1,10), so it appears to interleave two revisions of the entry
# point: an argparse-driven call to batch_process_molecules, and a delegation
# to macro_lactone_toolkit.cli.main via a rewritten sys.argv. Confirm which
# lines belong to the current file before relying on the order shown here.
if __name__ == "__main__":
import argparse
# Command-line options: --input (CSV path), --output (directory), --max (row limit).
parser = argparse.ArgumentParser(
description="Batch process macrolactones to extract side chain fragments"
)
parser.add_argument(
'--input',
type=str,
default='../ring16/temp.csv',
help='Input CSV file path'
)
parser.add_argument(
'--output',
type=str,
default='../output/fragments',
help='Output directory path'
)
parser.add_argument(
'--max',
type=int,
default=None,
help='Maximum number of molecules to process (default: all)'
)
args = parser.parse_args()
# Forward the parsed options to the batch routine.
batch_process_molecules(
csv_path=args.input,
output_base_dir=args.output,
max_molecules=args.max
)
# Rebuild argv as program name, the "fragment" subcommand, then the caller's
# original arguments, and hand control to the imported `main`.
sys.argv = ["macro-lactone-toolkit", "fragment", *sys.argv[1:]]
main()