first commit

This commit is contained in:
gzy
2025-12-16 11:39:15 +08:00
commit a3bdbee7c2
118 changed files with 34631 additions and 0 deletions

97
public/scripts/pHmodel.py Normal file
View File

@@ -0,0 +1,97 @@
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
import pickle
import argparse
import json # Add json module for output
# Define the gene order (retained from original logic)
gene_order = [
'X5.FTHF_cyc.lig', 'AAA_25', 'AAA_assoc_C', 'AcetylCoA_hyd_C', 'ASH',
'Big_3_5', 'CitMHS', 'CpsB_CapC', 'CpXC', 'CsbD', 'Cys_rich_CPXG', 'Cytidylate_kin2',
'CytoC_RC', 'DHquinase_I', 'Exo_endo_phos', 'FAD_binding_7', 'Glucodextran_N', 'GWxTD_dom',
'HipA_2', 'HTH_37', 'HupF_HypC', 'HycI', 'HypD', 'Ig_GlcNase', 'KdpA', 'KdpC', 'KdpD',
'Lactonase', 'MCRA', 'Met_gamma_lyase', 'Methyltransf_4', 'MgtE', 'MNHE', 'MrpF_PhaF',
'OprB', 'Paired_CXXCH_1', 'PDZ_2', 'PGI', 'PhaG_MnhG_YufB', 'Phenol_MetA_deg',
'Phosphoesterase', 'Polbeta', 'PQQ', 'Pro.kuma_activ', 'SelO', 'SNase', 'SOUL', 'TctA',
'TelA', 'TFR_dimer', 'TPP_enzyme_C', 'TrbI', 'UvdE', 'WXXGXW', 'YHS', 'zf.CDGSH'
]
def main():
# Parse command line arguments
parser = argparse.ArgumentParser(description='Predict pH preference based on Pfam annotations.')
parser.add_argument('pfam_path', type=str, help='Path to the directory containing Pfam annotation files (txt format)')
parser.add_argument('output_dir', type=str, help='Directory for temporary operations (not used for saving results)')
parser.add_argument('--model_path', type=str, default='/app/scripts/pH_model_2022-10-17_56_full.sav',
help='Path to the pre-trained model file (default: /app/scripts/pH_model_2022-10-17_56_full.sav)')
args = parser.parse_args()
# Validate input directory exists
if not os.path.isdir(args.pfam_path):
raise ValueError(f"Input directory not found: {args.pfam_path}")
# Create output directory if it doesn't exist (for potential temporary files)
os.makedirs(args.output_dir, exist_ok=True)
# Initialize matrix with gene_order as columns
matrix = pd.DataFrame(columns=gene_order)
# Read and process Pfam files
print("Processing Pfam annotation files...", file=sys.stderr) # Print logs to stderr
for pfam_file in tqdm(os.listdir(args.pfam_path), file=sys.stderr):
# Filter files with correct suffix and prefix
if pfam_file.endswith('.txt') and pfam_file.startswith('filtered_pfam_list'):
file_path = os.path.join(args.pfam_path, pfam_file)
# Read gene list (ensure first column is read and deduplicated)
try:
# Assume file is whitespace-separated with no header; read first column
gene_df = pd.read_table(file_path, header=None)
gene_list = gene_df[0].drop_duplicates().tolist()
except Exception as e:
print(f"Warning: Failed to process {pfam_file}, skipping. Error: {e}", file=sys.stderr)
continue
# Construct row data (1 if gene exists, 0 otherwise)
row_data = {col: 1 if col in gene_list else 0 for col in gene_order}
# Extract row name (based on filename)
row_name = pfam_file.split('filtered_pfam_list_')[1].split('.txt', 1)[0]
matrix.loc[row_name] = row_data
# Check if matrix is empty
if matrix.empty:
raise ValueError("No valid Pfam files found. Please check the input directory and file naming.")
# Load pre-trained model
print("Loading pre-trained model...", file=sys.stderr)
if not os.path.isfile(args.model_path):
raise FileNotFoundError(f"Model file not found: {args.model_path}")
with open(args.model_path, 'rb') as f:
model = pickle.load(f)
# Prepare features
X = matrix # Features
# Perform prediction
print("Performing pH prediction...", file=sys.stderr)
pred = model.predict(X)
# Construct result DataFrame
result = pd.DataFrame(data=pred, columns=['pHpred'])
result['genome'] = X.index # Genome names (from row indices)
# Save result to output directory
output_file = os.path.join(args.output_dir, 'pHpredict.csv')
result.to_csv(output_file, index=False)
# Construct result dictionary: {genome_name: prediction}
# result_dict = dict(zip(X.index, pred)) # X.index is the genome identifier
# Output JSON (only this line is printed to stdout)
# print(json.dumps(result_dict))
if __name__ == "__main__":
import sys # Import sys for stderr logging
main()