Files
labweb/public/scripts/GrowthPredict.R
2025-12-16 11:39:15 +08:00

134 lines
3.8 KiB
R

#!/usr/bin/env Rscript
# Load required libraries
if (!require("gRodon", character.only = TRUE)) {
stop("Package 'gRodon' is required. Please install it first.")
}
if (!require("Biostrings", character.only = TRUE)) {
stop("Package 'Biostrings' is required. Please install it first.")
}
if (!require("dplyr", character.only = TRUE)) {
stop("Package 'dplyr' is required. Please install it first.")
}
# Parse command-line arguments
args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 1) {
stop("Usage: Rscript GrowthPredict.R <input_directory>\nExample: Rscript GrowthPredict.R ./genome_files")
}
input_dir <- args[1]
# Check if input directory exists
if (!dir.exists(input_dir)) {
stop(paste("Input directory not found:", input_dir))
}
# Find all cds_name.txt files in the input directory
cds_files <- list.files(
path = input_dir,
pattern = "_cds_name\\.txt$", # Match files ending with "_cds_name.txt"
full.names = TRUE
)
if (length(cds_files) == 0) {
stop("No files with suffix '_cds_name.txt' found in the input directory.")
}
# Initialize an empty data frame to store results
result_df <- data.frame(
genome = character(),
LowerCI = numeric(),
UpperCI = numeric(),
d = numeric(),
stringsAsFactors = FALSE
)
# Process each cds_name.txt file
for (cds_file in cds_files) {
# Extract genome name from cds_file (split by "_cds_name.txt")
genome_name <- sub("_cds_name\\.txt$", "", basename(cds_file))
cat(paste("Processing genome:", genome_name, "\n"))
# Find corresponding .fna file (matching genome name)
ffn_file <- list.files(
path = input_dir,
pattern = paste0("^", genome_name, ".*\\.(ffn)$"), # Match .fna file starting with genome name
full.names = TRUE,
ignore.case = TRUE
)
if (length(ffn_file) == 0) {
warning(paste("No corresponding .ffn file found for", genome_name, "- skipping"))
next
}
if (length(ffn_file) > 1) {
warning(paste("Multiple .fna files found for", genome_name, "- using the first one"))
ffn_file <- ffn_file[1]
}
# Read CDS IDs from cds_name.txt
if (!file.exists(cds_file)) {
warning(paste("CDS file", cds_file, "not found - skipping"))
next
}
CDS_IDs <- readLines(cds_file, warn = FALSE)
# Read gene sequences from .ffn file
tryCatch({
genes <- Biostrings::readDNAStringSet(ffn_file)
}, error = function(e) {
warning(paste("Failed to read .fna file", ffn_file, "- skipping:", e$message))
next
})
# Subset genes to those in CDS_IDs
gene_IDs <- sub(" .*", "", names(genes)) # Extract ID before first space
genes <- genes[gene_IDs %in% CDS_IDs]
if (length(genes) == 0) {
warning(paste("No matching genes found for", genome_name, "- skipping"))
next
}
# Identify highly expressed genes (ribosomal proteins, excluding methyl/hydroxy)
highly_expressed <- grepl(
"^(?!.*(methyl|hydroxy)).*0S ribosomal protein",
names(genes),
ignore.case = TRUE,
perl = TRUE
)
# Predict growth parameters
tryCatch({
pred_result <- gRodon::predictGrowth(genes, highly_expressed)
}, error = function(e) {
warning(paste("Prediction failed for", genome_name, "- skipping:", e$message))
next
})
# Append results to data frame
result_df <- dplyr::bind_rows(result_df, data.frame(
genome = genome_name,
LowerCI = pred_result$LowerCI,
UpperCI = pred_result$UpperCI,
d = pred_result$d,
stringsAsFactors = FALSE
))
}
# Check if any results were generated
if (nrow(result_df) == 0) {
stop("No valid results generated. Check input files and try again.")
}
# Save results to CSV (tab-separated)
output_file <- file.path(input_dir, "growthPrediction.csv")
write.table(
result_df,
file = output_file,
sep = "\t", # Tab-separated
row.names = FALSE,
quote = FALSE
)
cat(paste("Results saved to", output_file, "\n"))