first commit
This commit is contained in:
97
public/scripts/pHmodel.py
Normal file
97
public/scripts/pHmodel.py
Normal file
@@ -0,0 +1,97 @@
import argparse
import json  # Add json module for output
import os
import pickle
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm
||||
# Pfam domains used as model features, listed in the exact column order the
# pre-trained pH model was fitted with (56 domains — note the "56" in the
# default model filename). Do not reorder or rename entries.
gene_order = [
    'X5.FTHF_cyc.lig', 'AAA_25', 'AAA_assoc_C', 'AcetylCoA_hyd_C',
    'ASH', 'Big_3_5', 'CitMHS', 'CpsB_CapC',
    'CpXC', 'CsbD', 'Cys_rich_CPXG', 'Cytidylate_kin2',
    'CytoC_RC', 'DHquinase_I', 'Exo_endo_phos', 'FAD_binding_7',
    'Glucodextran_N', 'GWxTD_dom', 'HipA_2', 'HTH_37',
    'HupF_HypC', 'HycI', 'HypD', 'Ig_GlcNase',
    'KdpA', 'KdpC', 'KdpD', 'Lactonase',
    'MCRA', 'Met_gamma_lyase', 'Methyltransf_4', 'MgtE',
    'MNHE', 'MrpF_PhaF', 'OprB', 'Paired_CXXCH_1',
    'PDZ_2', 'PGI', 'PhaG_MnhG_YufB', 'Phenol_MetA_deg',
    'Phosphoesterase', 'Polbeta', 'PQQ', 'Pro.kuma_activ',
    'SelO', 'SNase', 'SOUL', 'TctA',
    'TelA', 'TFR_dimer', 'TPP_enzyme_C', 'TrbI',
    'UvdE', 'WXXGXW', 'YHS', 'zf.CDGSH',
]
|
||||
|
||||
def _parse_args():
    """Parse and return the command-line arguments for the pH prediction run."""
    parser = argparse.ArgumentParser(description='Predict pH preference based on Pfam annotations.')
    parser.add_argument('pfam_path', type=str, help='Path to the directory containing Pfam annotation files (txt format)')
    parser.add_argument('output_dir', type=str, help='Directory for temporary operations (not used for saving results)')
    parser.add_argument('--model_path', type=str, default='/app/scripts/pH_model_2022-10-17_56_full.sav',
                        help='Path to the pre-trained model file (default: /app/scripts/pH_model_2022-10-17_56_full.sav)')
    return parser.parse_args()


def _build_feature_matrix(pfam_dir):
    """Build the genome-by-Pfam presence/absence matrix from a directory of files.

    Scans ``pfam_dir`` for files named ``filtered_pfam_list_<genome>.txt``,
    reads the first column of each as a gene list, and marks each gene in
    ``gene_order`` as 1 (present) or 0 (absent). Unreadable files are skipped
    with a warning on stderr. Returns a DataFrame whose index is the genome
    name and whose columns follow ``gene_order`` exactly.
    """
    print("Processing Pfam annotation files...", file=sys.stderr)  # Logs go to stderr
    rows = {}
    for pfam_file in tqdm(os.listdir(pfam_dir), file=sys.stderr):
        # Only process files matching the expected naming scheme.
        if not (pfam_file.startswith('filtered_pfam_list') and pfam_file.endswith('.txt')):
            continue
        file_path = os.path.join(pfam_dir, pfam_file)
        try:
            # read_table is tab-separated by default; files are assumed to be
            # one Pfam identifier per line in the first column — TODO confirm format.
            gene_df = pd.read_table(file_path, header=None)
            genes = set(gene_df[0].tolist())  # set: O(1) membership tests below
        except Exception as e:
            # Best-effort: skip unreadable/malformed files rather than aborting the run.
            print(f"Warning: Failed to process {pfam_file}, skipping. Error: {e}", file=sys.stderr)
            continue
        # Genome name is the part of the filename between the prefix and '.txt'.
        row_name = pfam_file.split('filtered_pfam_list_')[1].split('.txt', 1)[0]
        rows[row_name] = [1 if gene in genes else 0 for gene in gene_order]
    # Build the DataFrame once (int dtype) instead of growing it row-by-row,
    # which is quadratic and yields object dtype.
    return pd.DataFrame.from_dict(rows, orient='index', columns=gene_order)


def _load_model(model_path):
    """Load and return the pickled pre-trained model from ``model_path``.

    Raises FileNotFoundError if the file does not exist.
    """
    print("Loading pre-trained model...", file=sys.stderr)
    if not os.path.isfile(model_path):
        raise FileNotFoundError(f"Model file not found: {model_path}")
    with open(model_path, 'rb') as f:
        # NOTE(security): pickle.load executes arbitrary code on load — only
        # ever point --model_path at a trusted model file.
        return pickle.load(f)


def main():
    """Command-line entry point: predict pH preference from Pfam annotations.

    Builds a binary genome-by-Pfam feature matrix from the input directory,
    applies the pickled pre-trained model, and writes the predictions to
    ``pHpredict.csv`` in the output directory.

    Raises:
        ValueError: if the input directory is missing or yields no valid files.
        FileNotFoundError: if the model file does not exist.
    """
    args = _parse_args()

    # Validate input directory exists.
    if not os.path.isdir(args.pfam_path):
        raise ValueError(f"Input directory not found: {args.pfam_path}")

    # Create output directory if it doesn't exist (for potential temporary files).
    os.makedirs(args.output_dir, exist_ok=True)

    matrix = _build_feature_matrix(args.pfam_path)
    if matrix.empty:
        raise ValueError("No valid Pfam files found. Please check the input directory and file naming.")

    model = _load_model(args.model_path)

    # Perform prediction on the full feature matrix.
    print("Performing pH prediction...", file=sys.stderr)
    pred = model.predict(matrix)

    # Assemble and save the result table (one row per genome).
    result = pd.DataFrame(data=pred, columns=['pHpred'])
    result['genome'] = matrix.index  # Genome names come from the matrix row index
    output_file = os.path.join(args.output_dir, 'pHpredict.csv')
    result.to_csv(output_file, index=False)
||||
if __name__ == "__main__":
    # All imports (including sys, used for stderr logging) live at module top,
    # so main() also works when this module is imported rather than executed.
    main()
|
||||
Reference in New Issue
Block a user