first commit
This commit is contained in:
97
public/scripts/pHmodel.py
Normal file
97
public/scripts/pHmodel.py
Normal file
@@ -0,0 +1,97 @@
import argparse
import json  # Add json module for output
import os
import pickle
import sys

import numpy as np
import pandas as pd
from tqdm import tqdm
||||
# Pfam domains used as model features, listed in the exact column order the
# pre-trained pH model was fitted with (56 domains — note the "56" in the
# default model filename). Do not reorder or rename entries.
gene_order = [
    'X5.FTHF_cyc.lig', 'AAA_25', 'AAA_assoc_C', 'AcetylCoA_hyd_C',
    'ASH', 'Big_3_5', 'CitMHS', 'CpsB_CapC',
    'CpXC', 'CsbD', 'Cys_rich_CPXG', 'Cytidylate_kin2',
    'CytoC_RC', 'DHquinase_I', 'Exo_endo_phos', 'FAD_binding_7',
    'Glucodextran_N', 'GWxTD_dom', 'HipA_2', 'HTH_37',
    'HupF_HypC', 'HycI', 'HypD', 'Ig_GlcNase',
    'KdpA', 'KdpC', 'KdpD', 'Lactonase',
    'MCRA', 'Met_gamma_lyase', 'Methyltransf_4', 'MgtE',
    'MNHE', 'MrpF_PhaF', 'OprB', 'Paired_CXXCH_1',
    'PDZ_2', 'PGI', 'PhaG_MnhG_YufB', 'Phenol_MetA_deg',
    'Phosphoesterase', 'Polbeta', 'PQQ', 'Pro.kuma_activ',
    'SelO', 'SNase', 'SOUL', 'TctA',
    'TelA', 'TFR_dimer', 'TPP_enzyme_C', 'TrbI',
    'UvdE', 'WXXGXW', 'YHS', 'zf.CDGSH',
]
|
||||
|
||||
def _parse_args():
    """Parse and return the command-line arguments for the pH prediction run."""
    parser = argparse.ArgumentParser(description='Predict pH preference based on Pfam annotations.')
    parser.add_argument('pfam_path', type=str, help='Path to the directory containing Pfam annotation files (txt format)')
    parser.add_argument('output_dir', type=str, help='Directory for temporary operations (not used for saving results)')
    parser.add_argument('--model_path', type=str, default='/app/scripts/pH_model_2022-10-17_56_full.sav',
                        help='Path to the pre-trained model file (default: /app/scripts/pH_model_2022-10-17_56_full.sav)')
    return parser.parse_args()


def _build_feature_matrix(pfam_dir):
    """Build the genome-by-Pfam presence/absence matrix from a directory of files.

    Scans ``pfam_dir`` for files named ``filtered_pfam_list_<genome>.txt``,
    reads the first column of each as a gene list, and marks each gene in
    ``gene_order`` as 1 (present) or 0 (absent). Unreadable files are skipped
    with a warning on stderr. Returns a DataFrame whose index is the genome
    name and whose columns follow ``gene_order`` exactly.
    """
    print("Processing Pfam annotation files...", file=sys.stderr)  # Logs go to stderr
    rows = {}
    for pfam_file in tqdm(os.listdir(pfam_dir), file=sys.stderr):
        # Only process files matching the expected naming scheme.
        if not (pfam_file.startswith('filtered_pfam_list') and pfam_file.endswith('.txt')):
            continue
        file_path = os.path.join(pfam_dir, pfam_file)
        try:
            # read_table is tab-separated by default; files are assumed to be
            # one Pfam identifier per line in the first column — TODO confirm format.
            gene_df = pd.read_table(file_path, header=None)
            genes = set(gene_df[0].tolist())  # set: O(1) membership tests below
        except Exception as e:
            # Best-effort: skip unreadable/malformed files rather than aborting the run.
            print(f"Warning: Failed to process {pfam_file}, skipping. Error: {e}", file=sys.stderr)
            continue
        # Genome name is the part of the filename between the prefix and '.txt'.
        row_name = pfam_file.split('filtered_pfam_list_')[1].split('.txt', 1)[0]
        rows[row_name] = [1 if gene in genes else 0 for gene in gene_order]
    # Build the DataFrame once (int dtype) instead of growing it row-by-row,
    # which is quadratic and yields object dtype.
    return pd.DataFrame.from_dict(rows, orient='index', columns=gene_order)


def _load_model(model_path):
    """Load and return the pickled pre-trained model from ``model_path``.

    Raises FileNotFoundError if the file does not exist.
    """
    print("Loading pre-trained model...", file=sys.stderr)
    if not os.path.isfile(model_path):
        raise FileNotFoundError(f"Model file not found: {model_path}")
    with open(model_path, 'rb') as f:
        # NOTE(security): pickle.load executes arbitrary code on load — only
        # ever point --model_path at a trusted model file.
        return pickle.load(f)


def main():
    """Command-line entry point: predict pH preference from Pfam annotations.

    Builds a binary genome-by-Pfam feature matrix from the input directory,
    applies the pickled pre-trained model, and writes the predictions to
    ``pHpredict.csv`` in the output directory.

    Raises:
        ValueError: if the input directory is missing or yields no valid files.
        FileNotFoundError: if the model file does not exist.
    """
    args = _parse_args()

    # Validate input directory exists.
    if not os.path.isdir(args.pfam_path):
        raise ValueError(f"Input directory not found: {args.pfam_path}")

    # Create output directory if it doesn't exist (for potential temporary files).
    os.makedirs(args.output_dir, exist_ok=True)

    matrix = _build_feature_matrix(args.pfam_path)
    if matrix.empty:
        raise ValueError("No valid Pfam files found. Please check the input directory and file naming.")

    model = _load_model(args.model_path)

    # Perform prediction on the full feature matrix.
    print("Performing pH prediction...", file=sys.stderr)
    pred = model.predict(matrix)

    # Assemble and save the result table (one row per genome).
    result = pd.DataFrame(data=pred, columns=['pHpred'])
    result['genome'] = matrix.index  # Genome names come from the matrix row index
    output_file = os.path.join(args.output_dir, 'pHpredict.csv')
    result.to_csv(output_file, index=False)
||||
if __name__ == "__main__":
    # All imports (including sys, used for stderr logging) live at module top,
    # so main() also works when this module is imported rather than executed.
    main()
|
||||
Reference in New Issue
Block a user