#!/usr/bin/env Rscript # Load required libraries if (!require("gRodon", character.only = TRUE)) { stop("Package 'gRodon' is required. Please install it first.") } if (!require("Biostrings", character.only = TRUE)) { stop("Package 'Biostrings' is required. Please install it first.") } if (!require("dplyr", character.only = TRUE)) { stop("Package 'dplyr' is required. Please install it first.") } # Parse command-line arguments args <- commandArgs(trailingOnly = TRUE) if (length(args) != 1) { stop("Usage: Rscript GrowthPredict.R \nExample: Rscript GrowthPredict.R ./genome_files") } input_dir <- args[1] # Check if input directory exists if (!dir.exists(input_dir)) { stop(paste("Input directory not found:", input_dir)) } # Find all cds_name.txt files in the input directory cds_files <- list.files( path = input_dir, pattern = "_cds_name\\.txt$", # Match files ending with "_cds_name.txt" full.names = TRUE ) if (length(cds_files) == 0) { stop("No files with suffix '_cds_name.txt' found in the input directory.") } # Initialize an empty data frame to store results result_df <- data.frame( genome = character(), LowerCI = numeric(), UpperCI = numeric(), d = numeric(), stringsAsFactors = FALSE ) # Process each cds_name.txt file for (cds_file in cds_files) { # Extract genome name from cds_file (split by "_cds_name.txt") genome_name <- sub("_cds_name\\.txt$", "", basename(cds_file)) cat(paste("Processing genome:", genome_name, "\n")) # Find corresponding .fna file (matching genome name) ffn_file <- list.files( path = input_dir, pattern = paste0("^", genome_name, ".*\\.(ffn)$"), # Match .fna file starting with genome name full.names = TRUE, ignore.case = TRUE ) if (length(ffn_file) == 0) { warning(paste("No corresponding .ffn file found for", genome_name, "- skipping")) next } if (length(ffn_file) > 1) { warning(paste("Multiple .fna files found for", genome_name, "- using the first one")) ffn_file <- ffn_file[1] } # Read CDS IDs from cds_name.txt if (!file.exists(cds_file)) { warning(paste("CDS file", cds_file, "not found - skipping")) next } CDS_IDs <- readLines(cds_file, warn = FALSE) # Read gene sequences from .ffn file tryCatch({ genes <- Biostrings::readDNAStringSet(ffn_file) }, error = function(e) { warning(paste("Failed to read .fna file", ffn_file, "- skipping:", e$message)) next }) # Subset genes to those in CDS_IDs gene_IDs <- sub(" .*", "", names(genes)) # Extract ID before first space genes <- genes[gene_IDs %in% CDS_IDs] if (length(genes) == 0) { warning(paste("No matching genes found for", genome_name, "- skipping")) next } # Identify highly expressed genes (ribosomal proteins, excluding methyl/hydroxy) highly_expressed <- grepl( "^(?!.*(methyl|hydroxy)).*0S ribosomal protein", names(genes), ignore.case = TRUE, perl = TRUE ) # Predict growth parameters tryCatch({ pred_result <- gRodon::predictGrowth(genes, highly_expressed) }, error = function(e) { warning(paste("Prediction failed for", genome_name, "- skipping:", e$message)) next }) # Append results to data frame result_df <- dplyr::bind_rows(result_df, data.frame( genome = genome_name, LowerCI = pred_result$LowerCI, UpperCI = pred_result$UpperCI, d = pred_result$d, stringsAsFactors = FALSE )) } # Check if any results were generated if (nrow(result_df) == 0) { stop("No valid results generated. Check input files and try again.") } # Save results to CSV (tab-separated) output_file <- file.path(input_dir, "growthPrediction.csv") write.table( result_df, file = output_file, sep = "\t", # Tab-separated row.names = FALSE, quote = FALSE ) cat(paste("Results saved to", output_file, "\n"))