134 lines
3.8 KiB
R
134 lines
3.8 KiB
R
#!/usr/bin/env Rscript
|
|
|
|
# Load required libraries
|
|
if (!require("gRodon", character.only = TRUE)) {
|
|
stop("Package 'gRodon' is required. Please install it first.")
|
|
}
|
|
if (!require("Biostrings", character.only = TRUE)) {
|
|
stop("Package 'Biostrings' is required. Please install it first.")
|
|
}
|
|
if (!require("dplyr", character.only = TRUE)) {
|
|
stop("Package 'dplyr' is required. Please install it first.")
|
|
}
|
|
|
|
# Parse command-line arguments
|
|
args <- commandArgs(trailingOnly = TRUE)
|
|
if (length(args) != 1) {
|
|
stop("Usage: Rscript GrowthPredict.R <input_directory>\nExample: Rscript GrowthPredict.R ./genome_files")
|
|
}
|
|
input_dir <- args[1]
|
|
|
|
# Check if input directory exists
|
|
if (!dir.exists(input_dir)) {
|
|
stop(paste("Input directory not found:", input_dir))
|
|
}
|
|
|
|
# Find all cds_name.txt files in the input directory
|
|
cds_files <- list.files(
|
|
path = input_dir,
|
|
pattern = "_cds_name\\.txt$", # Match files ending with "_cds_name.txt"
|
|
full.names = TRUE
|
|
)
|
|
|
|
if (length(cds_files) == 0) {
|
|
stop("No files with suffix '_cds_name.txt' found in the input directory.")
|
|
}
|
|
|
|
# Initialize an empty data frame to store results
|
|
result_df <- data.frame(
|
|
genome = character(),
|
|
LowerCI = numeric(),
|
|
UpperCI = numeric(),
|
|
d = numeric(),
|
|
stringsAsFactors = FALSE
|
|
)
|
|
|
|
# Process each cds_name.txt file
|
|
for (cds_file in cds_files) {
|
|
# Extract genome name from cds_file (split by "_cds_name.txt")
|
|
genome_name <- sub("_cds_name\\.txt$", "", basename(cds_file))
|
|
cat(paste("Processing genome:", genome_name, "\n"))
|
|
|
|
# Find corresponding .fna file (matching genome name)
|
|
ffn_file <- list.files(
|
|
path = input_dir,
|
|
pattern = paste0("^", genome_name, ".*\\.(ffn)$"), # Match .fna file starting with genome name
|
|
full.names = TRUE,
|
|
ignore.case = TRUE
|
|
)
|
|
|
|
if (length(ffn_file) == 0) {
|
|
warning(paste("No corresponding .ffn file found for", genome_name, "- skipping"))
|
|
next
|
|
}
|
|
if (length(ffn_file) > 1) {
|
|
warning(paste("Multiple .fna files found for", genome_name, "- using the first one"))
|
|
ffn_file <- ffn_file[1]
|
|
}
|
|
|
|
# Read CDS IDs from cds_name.txt
|
|
if (!file.exists(cds_file)) {
|
|
warning(paste("CDS file", cds_file, "not found - skipping"))
|
|
next
|
|
}
|
|
CDS_IDs <- readLines(cds_file, warn = FALSE)
|
|
|
|
# Read gene sequences from .ffn file
|
|
tryCatch({
|
|
genes <- Biostrings::readDNAStringSet(ffn_file)
|
|
}, error = function(e) {
|
|
warning(paste("Failed to read .fna file", ffn_file, "- skipping:", e$message))
|
|
next
|
|
})
|
|
|
|
# Subset genes to those in CDS_IDs
|
|
gene_IDs <- sub(" .*", "", names(genes)) # Extract ID before first space
|
|
genes <- genes[gene_IDs %in% CDS_IDs]
|
|
|
|
if (length(genes) == 0) {
|
|
warning(paste("No matching genes found for", genome_name, "- skipping"))
|
|
next
|
|
}
|
|
|
|
# Identify highly expressed genes (ribosomal proteins, excluding methyl/hydroxy)
|
|
highly_expressed <- grepl(
|
|
"^(?!.*(methyl|hydroxy)).*0S ribosomal protein",
|
|
names(genes),
|
|
ignore.case = TRUE,
|
|
perl = TRUE
|
|
)
|
|
|
|
# Predict growth parameters
|
|
tryCatch({
|
|
pred_result <- gRodon::predictGrowth(genes, highly_expressed)
|
|
}, error = function(e) {
|
|
warning(paste("Prediction failed for", genome_name, "- skipping:", e$message))
|
|
next
|
|
})
|
|
|
|
# Append results to data frame
|
|
result_df <- dplyr::bind_rows(result_df, data.frame(
|
|
genome = genome_name,
|
|
LowerCI = pred_result$LowerCI,
|
|
UpperCI = pred_result$UpperCI,
|
|
d = pred_result$d,
|
|
stringsAsFactors = FALSE
|
|
))
|
|
}
|
|
|
|
# Check if any results were generated
|
|
if (nrow(result_df) == 0) {
|
|
stop("No valid results generated. Check input files and try again.")
|
|
}
|
|
|
|
# Save results to CSV (tab-separated)
|
|
output_file <- file.path(input_dir, "growthPrediction.csv")
|
|
write.table(
|
|
result_df,
|
|
file = output_file,
|
|
sep = "\t", # Tab-separated
|
|
row.names = FALSE,
|
|
quote = FALSE
|
|
)
|
|
|
|
cat(paste("Results saved to", output_file, "\n")) |