labweb/public/scripts/GrowthPredict.sh

#!/bin/bash
# Script name: GrowthPredict.sh
# Function: Copy GFF/FNA/FFN files, extract CDS names for each GFF, and run R script
# Usage: ./GrowthPredict.sh <prokka_annotation_path> <growth_predict_dir>
# Example: ./GrowthPredict.sh ./prokka_annotation ./growth_predict_dir

# Exactly two positional arguments are required (the Prokka annotation
# directory and the output directory). Anything else prints the usage text
# and terminates with status 1.
[ "$#" -eq 2 ] || {
  printf '%s\n' \
    "Error: Incorrect number of arguments!" \
    "Usage: $0 <prokka_annotation_path> <growth_predict_dir>" \
    "Example: $0 ./prokka_annotation ./growth_predict_dir"
  exit 1
}

# Positional arguments:
#   $1 - directory that is searched for the annotation files
#   $2 - directory that receives the copied files and the generated CDS lists
PROKKA_PATH="$1"
OUTPUT_DIR="$2"

# Validate the input directory first, so that an invalid path does not leave
# a freshly created (and empty) output directory behind.
if [ ! -d "$PROKKA_PATH" ]; then
  echo "Error: $PROKKA_PATH does not exist"
  exit 1
fi

# Create the output directory (no-op when it already exists). The `--` stops
# option parsing, so a directory name starting with '-' is handled correctly.
mkdir -p -- "$OUTPUT_DIR" || {
  echo "Error: Failed to create $OUTPUT_DIR"
  exit 1
}

# Find and copy GFF/FNA/FFN files

#######################################
# Copy every regular file below PROKKA_PATH whose name matches a pattern
# into OUTPUT_DIR.
# Globals:   PROKKA_PATH (read), OUTPUT_DIR (read)
# Arguments: $1 - find(1) -name pattern, e.g. "*.gff"
#            $2 - label used in the progress/warning/error messages
# Outputs:   progress, warning and error messages on stdout (plus cp -v output)
# Returns:   0 when at least one file was found and copied,
#            1 when no file matched; exits the script with status 1 when
#            cp itself fails.
#######################################
copy_annotation_files() {
  local pattern=$1
  local label=$2
  local -a found=()
  local path

  # Read NUL-delimited paths into an array so that names containing spaces,
  # newlines or glob characters reach cp as single, unmodified arguments.
  while IFS= read -r -d '' path; do
    found+=("$path")
  done < <(find "$PROKKA_PATH" -type f -name "$pattern" -print0)

  if [ "${#found[@]}" -eq 0 ]; then
    echo "Warning: No $label files found in $PROKKA_PATH"
    return 1
  fi

  echo "Copying $label files to $OUTPUT_DIR..."
  # `--` protects against file names that start with a dash.
  cp -v -- "${found[@]}" "$OUTPUT_DIR/" || {
    echo "Error: Failed to copy $label files"
    exit 1
  }
}

# GFF is required for CDS extraction, exit if none found
copy_annotation_files "*.gff" "GFF" || exit 1

# FNA and FFN files are optional: a missing set only produces a warning.
copy_annotation_files "*.fna" "FNA/FASTA" || true
copy_annotation_files "*.ffn" "FFN" || true

#######################################
# Print the ID attribute of every CDS feature in a GFF file, one per line.
# Arguments: $1 - path to the GFF file
# Outputs:   the CDS IDs on stdout
# Returns:   the exit status of awk
#######################################
extract_cds_ids() {
  # Single awk pass:
  #  * fields are split on tabs, so spaces inside the attribute column
  #    (e.g. in product descriptions) do not shift the columns;
  #  * reading stops at the ##FASTA marker so the embedded sequence section
  #    is never scanned;
  #  * the ID attribute is located explicitly, so records whose attribute
  #    column holds only "ID=..." (no ';') are kept instead of dropped.
  awk -F'\t' '
    /##FASTA/ { exit }
    $3 == "CDS" && match($9, /(^|;)ID=[^;]*/) {
      id = substr($9, RSTART, RLENGTH)
      sub(/^;?ID=/, "", id)
      print id
    }
  ' "$1"
}

# Process each GFF file to extract CDS names
echo "Extracting CDS names from GFF files..."
for gff in "$OUTPUT_DIR"/*.gff; do
  # Skip if not a valid file (also covers the unmatched-glob case, where the
  # literal pattern is passed through)
  [ -f "$gff" ] || continue

  # Get GFF filename without extension
  gff_basename=$(basename "$gff" .gff)

  # Define output CDS filename
  cds_output="$OUTPUT_DIR/${gff_basename}_cds_name.txt"

  # Extract CDS IDs; with a single awk process its exit status is the one
  # that is actually checked here.
  extract_cds_ids "$gff" > "$cds_output" || {
    echo "Error: Failed to process $gff"
    exit 1
  }

  echo "Generated: $cds_output"
done

# Hand the populated output directory to the R script. The script path is
# relative, so it is resolved against the current working directory.
# A non-zero status from Rscript aborts with status 1.
echo "Running growth prediction R script..."
if ! Rscript ./scripts/GrowthPredict.R "$OUTPUT_DIR"; then
  echo "Error: R script execution failed"
  exit 1
fi

echo "All steps completed successfully!"