Files
labweb/public/scripts/pfam_annotation.sh
2025-12-16 11:39:15 +08:00

78 lines
2.6 KiB
Bash
Executable File

#!/bin/bash
# Script name: pfam_annotation.sh
# Function: Batch process .faa files with hmmsearch (Pfam analysis), output to pfam_predict directory
# Usage: ./pfam_annotation.sh <input_directory>
# Example: ./pfam_annotation.sh ./input_genes
# Check input parameter (must be 1: input directory containing .faa files)
if [ $# -ne 2 ]; then
echo "Error: Incorrect number of arguments!"
echo "Usage: $0 <input_directory> <output_directory>"
echo "Example: $0 ./input_genes ./ph_predict_dir"
exit 1
fi
input_dir="$1"
output_root="$2" # Fixed output directory name
# Create main output directory (pfam_predict)
echo "Creating main output directory: $output_root"
mkdir -p "$output_root" || {
echo "Error: Failed to create output directory $output_root!"
exit 1
}
# Find all .faa files in input directory (including subdirectories)
echo "Searching for .faa files in $input_dir..."
FAA_FILES=$(find "$input_dir" -type f -name "*.faa")
# Check if any .faa files were found
if [ -z "$FAA_FILES" ]; then
echo "Error: No .faa files found in $input_dir (including subdirectories)!"
exit 1
fi
# Copy all .faa files to pfam_predict directory
echo "Copying .faa files to $output_root..."
cp -v $FAA_FILES "$output_root/" || {
echo "Error: Failed to copy .faa files to $output_root!"
exit 1
}
# Process each .faa file with hmmsearch (Pfam analysis)
echo "Starting Pfam analysis with hmmsearch..."
for faa_file in "$output_root"/*.faa; do
# Skip if not a valid file (e.g., empty glob)
[ -f "$faa_file" ] || continue
# Extract sample name (remove path and .faa suffix)
sample_name=$(basename "$faa_file" .faa)
# Define output files for this sample
output_tbl="${output_root}/output_pfam_${sample_name}.tbl"
filtered_tbl="${output_root}/filtered_pfam_${sample_name}.tbl"
gene_list="${output_root}/filtered_pfam_list_${sample_name}.txt"
echo "Processing $faa_file..."
# Run hmmsearch with Pfam-A.hmm
hmmsearch --cpu 20 --tblout "$output_tbl" /mnt/Pfam-A.hmm "$faa_file" > /dev/null 2>&1
# Check if hmmsearch executed successfully
if [ $? -ne 0 ]; then
echo "Warning: hmmsearch failed for $faa_file"
continue
fi
# Filter results: full sequence score (column 6) > 10 and full sequence E-value (column 5) < 1e-5
awk '!/#/ && $6 > 10 && $5 < 1e-5' "$output_tbl" > "$filtered_tbl"
# Extract gene names (column 3)
awk '{print $3}' "$filtered_tbl" > "$gene_list"
echo "Completed processing $sample_name:"
echo " Raw results: $output_tbl"
echo " Filtered results: $filtered_tbl"
echo " Gene list: $gene_list"
done
echo "All Pfam analyses completed! Results in $output_root"