labweb/public/scripts/GrowthPredict.R

#!/usr/bin/env Rscript

# Load required libraries
if (!require("gRodon", character.only = TRUE)) {
  stop("Package 'gRodon' is required. Please install it first.")
}
if (!require("Biostrings", character.only = TRUE)) {
  stop("Package 'Biostrings' is required. Please install it first.")
}
if (!require("dplyr", character.only = TRUE)) {
  stop("Package 'dplyr' is required. Please install it first.")
}

# Parse command-line arguments
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 1) {
  stop("Usage: Rscript GrowthPredict.R <input_directory>\nExample: Rscript GrowthPredict.R ./genome_files")
}
input_dir <- args[1]

# Check if input directory exists
if (!dir.exists(input_dir)) {
  stop(paste("Input directory not found:", input_dir))
}

# Find all cds_name.txt files in the input directory
cds_files <- list.files(
  path = input_dir,
  pattern = "_cds_name\\.txt$",  # Match files ending with "_cds_name.txt"
  full.names = TRUE
)

if (length(cds_files) == 0) {
  stop("No files with suffix '_cds_name.txt' found in the input directory.")
}

# Initialize an empty data frame to store results
result_df <- data.frame(
  genome = character(),
  LowerCI = numeric(),
  UpperCI = numeric(),
  d = numeric(),
  stringsAsFactors = FALSE
)

# Process each cds_name.txt file
for (cds_file in cds_files) {
  # Extract genome name from cds_file (split by "_cds_name.txt")
  genome_name <- sub("_cds_name\\.txt$", "", basename(cds_file))
  cat(paste("Processing genome:", genome_name, "\n"))

  # Find corresponding .fna file (matching genome name)
  ffn_file <- list.files(
    path = input_dir,
    pattern = paste0("^", genome_name, ".*\\.(ffn)$"),  # Match .fna file starting with genome name
    full.names = TRUE,
    ignore.case = TRUE
  )

  if (length(ffn_file) == 0) {
    warning(paste("No corresponding .ffn file found for", genome_name, "- skipping"))
    next
  }
  if (length(ffn_file) > 1) {
    warning(paste("Multiple .fna files found for", genome_name, "- using the first one"))
    ffn_file <- ffn_file[1]
  }

  # Read CDS IDs from cds_name.txt
  if (!file.exists(cds_file)) {
    warning(paste("CDS file", cds_file, "not found - skipping"))
    next
  }
  CDS_IDs <- readLines(cds_file, warn = FALSE)

  # Read gene sequences from .ffn file
  tryCatch({
    genes <- Biostrings::readDNAStringSet(ffn_file)
  }, error = function(e) {
    warning(paste("Failed to read .fna file", ffn_file, "- skipping:", e$message))
    next
  })

  # Subset genes to those in CDS_IDs
  gene_IDs <- sub(" .*", "", names(genes))  # Extract ID before first space
  genes <- genes[gene_IDs %in% CDS_IDs]

  if (length(genes) == 0) {
    warning(paste("No matching genes found for", genome_name, "- skipping"))
    next
  }

  # Identify highly expressed genes (ribosomal proteins, excluding methyl/hydroxy)
  highly_expressed <- grepl(
    "^(?!.*(methyl|hydroxy)).*0S ribosomal protein",
    names(genes),
    ignore.case = TRUE,
    perl = TRUE
  )

  # Predict growth parameters
  tryCatch({
    pred_result <- gRodon::predictGrowth(genes, highly_expressed)
  }, error = function(e) {
    warning(paste("Prediction failed for", genome_name, "- skipping:", e$message))
    next
  })

  # Append results to data frame
  result_df <- dplyr::bind_rows(result_df, data.frame(
    genome = genome_name,
    LowerCI = pred_result$LowerCI,
    UpperCI = pred_result$UpperCI,
    d = pred_result$d,
    stringsAsFactors = FALSE
  ))
}

# Check if any results were generated
if (nrow(result_df) == 0) {
  stop("No valid results generated. Check input files and try again.")
}

# Save results to CSV (tab-separated)
output_file <- file.path(input_dir, "growthPrediction.csv")
write.table(
  result_df,
  file = output_file,
  sep = "\t",  # Tab-separated
  row.names = FALSE,
  quote = FALSE
)

cat(paste("Results saved to", output_file, "\n"))