""" Script to load MLModel class and generate predictions """ import json import argparse from model_data.simulation_system.MLModel.Models import AutogluonModel from model_data.simulation_system.core.Logger import logger from model_data.simulation_system.core.DataLoader import DataLoader import pandas as pd from typing import Optional from datetime import datetime from model_data.simulation_system.core.Settings import ( BASE_REGISTRY_PATH, REGISTRY_FILE, PREDICTION_LOCATION, PREDICTION_FILE, METADATA_FILE ) TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") # FOR TESTING # For now just loading data first and then passing into function (i.e. as if we receive json data and convert to # DataFrame) # TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet") # DATA = TEST_DATA.sample(1) def ingest_arguments() -> argparse.Namespace: """ Helper function to take in arguments from script start """ parser = argparse.ArgumentParser(description='Inputs for training script') parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE') parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here') parser.add_argument('--data', type=str, help='Json data for predictions') parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training') args = parser.parse_args() return args def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None): """ Main pipeline function """ registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE if registry_path is None or not registry_path.exists(): logger.error("No registry path provided or registry doesn't exist") exit(1) if model_path is not None: logger.info("User specified a model to load - ignoring registry") model_location = model_path model_type = model_path model_name = model_path else: # TODO: Think about where registry will sit/ type logger.info("Loading best model from registry") registry_df = pd.read_csv(registry_path) best_model_df = registry_df[registry_df['best_model']] model_location = best_model_df['model_location'].values[0] model_type = best_model_df['model_type'].values[0] model_name = best_model_df['model_name'].values[0] logger.info("--- Model Info: ---") logger.info(f"Model type: {model_type}") logger.info(f"Model name: {model_name}") logger.info(f"Model location: {model_location}") logger.info("--- Loading Data ---") if data is None and data_path is None: logger.error("No Data/Data Path passed") exit(1) if data_path and data is None: logger.info("Loading data from provided path") data = DataLoader().load(filepath=data_path, index_col="UPRN") # TODO: DOWNSAMPLING DOWN TO JUST USE ONE FOR PREDICTION data = data.sample(1) else: logger.info('Using data provided') data = json.loads(data) data = pd.DataFrame([data]) print(data) logger.info("--- Loading Model ---") model = AutogluonModel() model.load_model(filepath=model_location) logger.info("--- Generating Predictions ---") prediction = model.generate_predictions(data=data) # Save prediction some where? # prediction.to_csv("s3?") # TODO: Check how we want to structure outputs # For now, just categorise by uprn and timestamp # Assume one uprn coming in for now uprn = data.index.values[0] # Saving prediction local for now # TODO: change uprn to TARGET_ID, put in setting logger.info("--- Outputting prediction and metadata --- ") output_base = PREDICTION_LOCATION / target_column / uprn / TIMESTAMP output_base.mkdir(parents=True, exist_ok=True) # TODO: change model.model.info to a class method for MLModel json_prediction = prediction.to_json(output_base / PREDICTION_FILE) prediction_metadata = { "model_type": model_type, "model_name": model_name, "model_location": model_location, "model_settings": model.model.info() } pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE) return json_prediction if __name__ == "__main__": args = ingest_arguments() # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}' # Data path can be passed as so: python3 predictions.py --data-path # ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)