#!/bin/bash
# Script name: prepare_growth_prediction.sh
# Function: Copy GFF/FNA/FFN files, extract CDS names for each GFF, and run R script
# Usage: ./prepare_growth_prediction.sh <prokka_annotation_path> <growth_predict_dir>
# Example: ./prepare_growth_prediction.sh ./prokka_annotation ./growth_predict_dir

# Validate the command line: exactly two positional arguments are required,
# the Prokka annotation directory and the directory to populate.
if (( $# != 2 )); then
  # Usage errors are diagnostics, so they are written to stderr, keeping
  # stdout free for progress output.
  {
    echo "Error: Incorrect number of arguments!"
    echo "Usage: $0 <prokka_annotation_path> <growth_predict_dir>"
    echo "Example: $0 ./prokka_annotation ./growth_predict_dir"
  } >&2
  exit 1
fi

# Directory that is searched for Prokka output files.
PROKKA_PATH="$1"
# Directory that receives the copied files and the generated CDS lists.
OUTPUT_DIR="$2"

# Validate the input directory before touching the filesystem, so that a
# mistyped Prokka path does not leave an empty output directory behind.
if [[ ! -d "$PROKKA_PATH" ]]; then
  echo "Error: $PROKKA_PATH does not exist or is not a directory" >&2
  exit 1
fi

# Create the output directory together with any missing parents; this is a
# no-op when the directory already exists.
if ! mkdir -p -- "$OUTPUT_DIR"; then
  echo "Error: Failed to create $OUTPUT_DIR" >&2
  exit 1
fi

#######################################
# Copy every file below PROKKA_PATH with the given extension into OUTPUT_DIR.
# Globals:   PROKKA_PATH (read), OUTPUT_DIR (read)
# Arguments: $1 - file extension without the leading dot (e.g. gff)
#            $2 - label used in progress and error messages
#            $3 - "required" to abort the script when nothing matches;
#                 any other value (or omitted) only prints a warning
# Outputs:   progress on stdout, warnings and errors on stderr
# Returns:   0 on success or when an optional file type is absent.
#            Exits the script with status 1 when the copy fails or when a
#            required file type is absent.
#######################################
copy_by_extension() {
  local ext=$1 label=$2 mode=${3:-optional}
  local -a matches=()
  local path

  # Read NUL-delimited paths into an array so that names containing spaces
  # or glob characters reach cp as single, unmodified arguments.
  while IFS= read -r -d '' path; do
    matches+=("$path")
  done < <(find "$PROKKA_PATH" -type f -name "*.${ext}" -print0)

  if (( ${#matches[@]} == 0 )); then
    if [[ "$mode" == "required" ]]; then
      echo "Error: No ${label} files found in $PROKKA_PATH" >&2
      exit 1
    fi
    echo "Warning: No ${label} files found in $PROKKA_PATH" >&2
    return 0
  fi

  echo "Copying ${label} files to $OUTPUT_DIR..."
  # NOTE(review): files sharing a basename in different subdirectories all
  # target the same name in OUTPUT_DIR - confirm inputs have unique names.
  if ! cp -v -- "${matches[@]}" "$OUTPUT_DIR/"; then
    echo "Error: Failed to copy ${label} files" >&2
    exit 1
  fi
}

# GFF is required for CDS extraction; FNA and FFN are optional.
copy_by_extension gff "GFF" required
copy_by_extension fna "FNA/FASTA"
copy_by_extension ffn "FFN"

#######################################
# Print the ID attribute of every CDS feature in a GFF file, one per line.
# Reading stops at the ##FASTA directive, so embedded sequence data is
# never scanned.
# Arguments: $1 - path to the GFF file
# Outputs:   CDS IDs on stdout
# Returns:   exit status of awk (non-zero if the file cannot be read)
#######################################
extract_cds_ids() {
  # One awk process replaces the sed plus four-awk pipeline, so a failure is
  # visible in the exit status. Columns are split on tabs, and the ID tag is
  # located wherever it appears in the attribute column.
  awk -F '\t' '
    /^##FASTA/ { exit }
    /^#/       { next }
    $3 == "CDS" {
      n = split($9, attrs, ";")
      for (i = 1; i <= n; i++) {
        if (attrs[i] ~ /^ID=/) {
          print substr(attrs[i], 4)
          break
        }
      }
    }
  ' < "$1"
}

# Write one <name>_cds_name.txt file per GFF file found in OUTPUT_DIR.
echo "Extracting CDS names from GFF files..."
for gff in "$OUTPUT_DIR"/*.gff; do
  # An unmatched glob stays literal; skip anything that is not a regular file.
  [[ -f "$gff" ]] || continue

  # File name without directory and without the .gff extension.
  gff_basename=${gff##*/}
  gff_basename=${gff_basename%.gff}

  cds_output="$OUTPUT_DIR/${gff_basename}_cds_name.txt"

  if ! extract_cds_ids "$gff" > "$cds_output"; then
    echo "Error: Failed to process $gff" >&2
    exit 1
  fi

  echo "Generated: $cds_output"
done

# Run the growth-prediction R script on the prepared output directory.
# The script path is relative to the current working directory, so this
# must be invoked from the directory that contains ./scripts.
echo "Running growth prediction R script..."

# Fail with a clear message when R is not installed, instead of relying on
# the shell's generic "command not found" output.
if ! command -v Rscript >/dev/null 2>&1; then
  echo "Error: Rscript not found in PATH" >&2
  exit 1
fi

if ! Rscript ./scripts/GrowthPredict.R "$OUTPUT_DIR"; then
  echo "Error: R script execution failed" >&2
  exit 1
fi

echo "All steps completed successfully!"