diff --git a/.github/workflows/cml.yml b/.github/workflows/cml.yml
new file mode 100644
index 00000000..bf361dd3
--- /dev/null
+++ b/.github/workflows/cml.yml
@@ -0,0 +1,38 @@
+name: model-training
+on:
+  push:
+    branches:
+      - mlmodel
+permissions: write-all  # NOTE(review): broader than needed - narrow to the scopes `cml comment create` requires
+jobs:
+  run:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"  # the pinned autogluon 0.8.2 stack targets Python 3.8-3.10
+      - uses: iterative/setup-cml@v1
+      - name: Train model
+        env:
+          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PYTHONPATH: ${{ github.workspace }}  # training.py imports `model_data.simulation_system.*` from the repo root
+        run: |
+          cd model_data/simulation_system
+          pip install -r requirements.txt
+          python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
+
+          cd model_directory/RDSAP_CHANGE
+          echo "## Model metrics" > report.md
+          metrics_location=$(find . -maxdepth 10 -name "metrics.md")
+          echo $metrics_location
+          cat $metrics_location >> report.md
+
+          # echo "## Residuals plot from model" >> report.md
+          # metrics_location=$(find . -maxdepth 10 -name "residuals.png")
+          # echo $metrics_location
+          # cd $metrics_location
+          # echo "" >> report.md
+
+          cml comment create report.md
+          # cml comment create --log debug --publish false report.md
diff --git a/.gitignore b/.gitignore
index cb17846e..be9da3aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -252,6 +252,7 @@ backend/.idea
open_uprn/.idea/
conservation_areas/.idea/
model_data/.idea/
+model_data/simulation_system/.idea/
model_data/simulation_system/data*
diff --git a/.idea/Model.iml b/.idea/Model.iml
index 05b9012b..0ded8e60 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 3b05c6ac..ae87bfde 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
-
+
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/model_data/simulation_system/MLModel/BaseMLModel.py b/model_data/simulation_system/MLModel/BaseMLModel.py
new file mode 100644
index 00000000..42106a33
--- /dev/null
+++ b/model_data/simulation_system/MLModel/BaseMLModel.py
@@ -0,0 +1,59 @@
+"""
+BaseMLModel class
+This is the base protocol:
+- Any implementation will be its own separate file
+Key tasks:
+- Template Model class for different model types
+- Save model
+- Load Model
+- Generate Inference
+"""
+
+from pathlib import Path
+from typing import Protocol, NamedTuple
+import pandas as pd
+
+
+class MLModel(Protocol):
+    '''
+    Structural interface (typing.Protocol) for model wrappers: load, save, train, predict, evaluate.
+    '''
+
+    def load_model(self, filepath: Path) -> None:
+        """
+        Load a model from `filepath` and keep it on the instance for later use.
+        """
+
+    def save_model(self, output_filepath: Path) -> None:
+        """
+        Save the current model to `output_filepath`.
+        """
+
+    def train_model(
+            self,
+            data: pd.DataFrame,
+            target_column: str,
+            hyperparameter: dict
+    ) -> None:
+        """
+        Train a model on `data` to predict `target_column`, using model-specific `hyperparameter` values.
+        """
+
+    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        Generate predictions for the rows of `data` with the loaded/trained model.
+        """
+
+    def model_evaluation(self, validation_data: pd.DataFrame, target_column: str, metrics_location: Path = None) -> NamedTuple:
+        """
+        Evaluate on `validation_data` and return predictions/metrics - NOTE(review): `metrics_location` use is implementation-defined.
+        """
+
+    def optimise_model_for_deployment(self):
+        """
+        Performance post-processing of the model so that it is ready for deployment.
+        """
+
+    def generate_meta_data(self):
+        """Presumably produces metadata describing the model.
+        NOTE(review): the contract was left undocumented - confirm the expected return value."""
diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py
new file mode 100644
index 00000000..ccf6fdf8
--- /dev/null
+++ b/model_data/simulation_system/MLModel/Models.py
@@ -0,0 +1,136 @@
+"""
+Different implementations of the MLModel Protocol
+Uses the BaseMLModel protocol
+Key tasks:
+- Template Model class for different model types
+- Save model
+- Load Model
+- Generate Inference
+"""
+
+from typing import NamedTuple
+from pathlib import Path
+import pandas as pd
+from autogluon.tabular import TabularDataset, TabularPredictor
+from sklearn.metrics import mean_absolute_percentage_error
+from model_data.simulation_system.core.Logger import logger
+
+AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
+METRIC_FILENAME = "metrics.csv"
+
+
+class AutogluonModel:
+ """
+ Autogluon model that implements the MLModel Protocol
+ """
+
+ def __init__(self, output_filepath: Path = None) -> None:
+ self.model = None
+ self.output_filepath = output_filepath
+ self.predictions = None
+
+ def load_model(self, filepath: Path) -> None:
+ """
+ Providing a path, this function will load the model to be used. Will load to internal variable
+ """
+ self.model = TabularPredictor.load(path=filepath)
+
+ def save_model(self, output_filepath: Path = None) -> None:
+ """
+ Providing a path, this function will save the model to be used.
+ """
+ logger.info("Using AutoGluon Model - Model saving already occured")
+
+ def train_model(
+ self,
+ data: pd.DataFrame,
+ target_column: str,
+ hyperparameters: dict = None) -> None:
+ """
+ For the given data and hyperparameters, a model is trained
+ """
+ if self.output_filepath is None:
+ logger.error("Please specify a output_filepath in order to train a model")
+ exit(1)
+
+ if set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters.keys()):
+ print("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
+ exit(1)
+
+ AGdata = TabularDataset(data=data)
+
+ self.model = TabularPredictor(
+ label=target_column,
+ path=self.output_filepath,
+ problem_type=hyperparameters['problem_type'],
+ eval_metric=hyperparameters['eval_metric']
+ ).fit(
+ AGdata,
+ time_limit=hyperparameters['time_limit'],
+ presets=hyperparameters['presets'],
+ excluded_model_types=hyperparameters['excluded_model_types']
+ )
+
+ def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
+ """
+ For the given dataframe, model is loaded and predictions are generated
+ """
+
+ if self.model is None:
+ print("No model loaded/ trained")
+ exit(1)
+
+ predictions = self.model.predict(data)
+
+ return predictions
+
+ def model_evaluation(
+ self,
+ validation_data: pd.DataFrame,
+ target_column: str,
+ metrics_location: Path = None,
+ metric_filename: str = METRIC_FILENAME
+ ) -> pd.DataFrame:
+ """
+ For any validation data, a set of predictions and metrics are return
+ """
+ if metrics_location is None:
+ logger.warning("Metrics will be outputted to current folder")
+
+ if self.model is None:
+ logger.error("No model loaded/ trained - Unable to generate evaluation")
+ exit(1)
+
+ performance = self.model.evaluate(validation_data)
+ predictions = self.generate_predictions(validation_data)
+
+ logger.info("Prediction used for evaluations are saved in self.prediction")
+ self.predictions = predictions
+
+ # TODO: Can have a custom metric class that defines all different metrics we want
+ metric_mape = mean_absolute_percentage_error(validation_data[target_column], predictions)
+
+ performance['mape'] = metric_mape
+
+ logger.info("Saving metric file as metric.csv")
+ metrics_location.mkdir(exist_ok=True)
+
+ metrics_df = pd.DataFrame([performance])
+ metrics_df.to_csv(metrics_location / metric_filename)
+ markdown_filename = metric_filename.split(".")[0] + ".md"
+ metrics_df.to_markdown(metrics_location / markdown_filename)
+
+ return metrics_df
+
+ def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
+ """
+ We can optimise the deployment for a autogluon model
+ """
+ if self.model is None:
+ raise ValueError("No model to optimise for deployment")
+
+ if deployment_path is None:
+ raise ValueError("Deployment path required")
+
+ # This will return a string path of the location
+ return self.model.clone_for_deployment(deployment_path)
diff --git a/model_data/simulation_system/MLModel/__init__.py b/model_data/simulation_system/MLModel/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/model_data/simulation_system/Makefile b/model_data/simulation_system/Makefile
new file mode 100644
index 00000000..97df8d3e
--- /dev/null
+++ b/model_data/simulation_system/Makefile
@@ -0,0 +1,14 @@
+.PHONY: init
+init: build docker
+
+.PHONY: build
+build:
+	docker-compose build
+
+.PHONY: docker
+docker:
+	docker-compose up -d
+
+.PHONY: down
+down:
+	docker-compose down
\ No newline at end of file
diff --git a/model_data/simulation_system/README.md b/model_data/simulation_system/README.md
new file mode 100644
index 00000000..b6fe8327
--- /dev/null
+++ b/model_data/simulation_system/README.md
@@ -0,0 +1,66 @@
+# Simulation System
+
+Starter Readme:
+Steps for pipeline:
+
+- (WIP) Use Makefile to start up mock up s3 service
+ - By running `make init`, this will run the `docker-compose build` and `docker-compose up -d`, which spins up a S3 service
+ - This docker compose is running in detached mode `-d`, so it will not output anything to the terminal
+
+- Once the Minio service is run, you can run the `training.py` file to start a model build process
+ - This will output a model, for a given target column, and add model name composed of some of the hyperparameters
+ - An example of running this file is:
+ - `python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet`
+ - Outputs of the pipeline are:
+ - A model directory bucket
+ - A target variable prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
+ - A model type prefix (i.e. autogluon, tensorflow etc)
+ - A model name prefix (i.e. rdsap_change_medium_quality_60_TIMESTAMP)
+ - This model name is made up of target variable, quality, time spent training and timestamp
+ - Within this prefix, there are three folders:
+ - model
+ - The model path that can be loaded in the codebase
+ - deployment
+ - The optimised model that can be deployed (may or may not need this)
+ - metrics
+ - The metrics generated from the model (may or may not need this as this can be contained in the registry)
+
+- Once model build is finished, you can run the `predictions.py` file to generate predictions
+ - By default, the prediction pipeline will select the best model based on **mean absolute error** from the model registry
+ - This can be overwritten by specifying a model_path, which will load an alternative model
+ - There are two ways of getting data into the pipeline:
+ - Using the `--data` argument:
+ - This is a JSON string which can be passed as `python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'`
+ - Note the single and double quotation marks, as this affects the ingestion
+ - Using the `--data-path` argument:
+ - This can be a filepath (Can imagine that we might want to pull data from S3/ DB)
+ - An example of running the file is:
+ - `python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet`
+ - Outputs of the pipeline are:
+ - prediction bucket
+ - a Target variables prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
+ - a uprn prefix (i.e 0123456789)
+ - a `prediction.json`
+ - a `metadata.json`
+ - This is all the metadata from the model (can change this if needed)
+
+- NOTE: If you wish to change any settings, these are currently all in the `Settings.py` file
+ - It will be separated out eventually but for now, it works to keep track of anything that we might want to respecify.
+ - I.e. the hyperparameters for models are in here but will move into a separate configuration file
+
+
+# TODO:
+- Structure/ MLOps:
+ - Add configuration files (dev, staging, prod), including hyperparameters
+ - Add precommit hooks (linters, branch names, etc)
+ - Sphinx documentation
+ - Sort out local mock up services
+ - Sort out Model Registry
+ - Sort out Data version control
+- Data Science:
+ - Implement a metrics class, to hold all metrics
+ - Rebuild metrics script (Could be a one off but good to have)
+ - Determine metrics
+ - Implement and test custom model (Tensorflow Decision Trees etc)
+- Orchestration:
+ - Lambda handler for the pipeline
\ No newline at end of file
diff --git a/model_data/simulation_system/core/DataLoader.py b/model_data/simulation_system/core/DataLoader.py
new file mode 100644
index 00000000..dcd7af16
--- /dev/null
+++ b/model_data/simulation_system/core/DataLoader.py
@@ -0,0 +1,25 @@
+import pandas as pd
+import os
+
+
+class DataLoader:
+
+ @staticmethod
+ def load(filepath: str, index_col: str = None) -> pd.DataFrame:
+ """
+ Load different datasets
+ """
+
+ if not os.path.exists(filepath):
+ raise FileNotFoundError(f"File not found: {filepath}")
+
+ if filepath.endswith('.parquet'):
+ df = pd.read_parquet(filepath)
+ if index_col is not None:
+ df = df.set_index(index_col)
+ elif filepath.endswith('.csv'):
+ df = pd.read_csv(filepath, index_col=index_col)
+ else:
+ raise ValueError(f"File format not supported for file: {filepath}")
+
+ return df
diff --git a/model_data/simulation_system/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py
similarity index 99%
rename from model_data/simulation_system/DataProcessor.py
rename to model_data/simulation_system/core/DataProcessor.py
index 50abd8e3..7b50f486 100644
--- a/model_data/simulation_system/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@@ -2,7 +2,7 @@ from pathlib import Path
import numpy as np
import pandas as pd
from model_data.BaseUtility import Definitions
-from simulation_system.Settings import (
+from simulation_system.core.Settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
FULLY_GLAZED_DESCRIPTIONS,
@@ -23,6 +23,7 @@ class DataProcessor:
def __init__(self, filepath: Path) -> None:
self.filepath = filepath
+ self.data = None
def load_data(self, low_memory=False) -> None:
self.data = pd.read_csv(self.filepath, low_memory=low_memory)
diff --git a/model_data/simulation_system/core/FeatureProcessor.py b/model_data/simulation_system/core/FeatureProcessor.py
new file mode 100644
index 00000000..cefcee9b
--- /dev/null
+++ b/model_data/simulation_system/core/FeatureProcessor.py
@@ -0,0 +1,70 @@
+"""
+Create additional features from the dataset
+"""
+
+import pandas as pd
+from typing import List
+from model_data.simulation_system.core.Logger import logger
+
+RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
+HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
+
+RANDOM_SEED = 0
+
+
+class FeatureProcessor:
+ """
+ Handle all feature manipulation before modelling
+ """
+
+ @staticmethod
+ def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame:
+ """
+ Remove the unused columns for RDS
+ """
+ if target_column == "RDSAP_CHANGE":
+ df = df.drop(columns=RDSAP_CHANGE_DROP_COLUMNS)
+ elif target_column == "HEAT_DEMAND_CHANGE":
+ df = df.drop(columns=HEAT_DEMAND_CHANGE_DROP_COLUMNS)
+ return df
+
+ @staticmethod
+ def retain_features(df: pd.DataFrame, features: List[str] = None):
+ """
+ Determine which columns to keep for modelling
+ """
+ if features is None:
+ features = df.columns
+ else:
+ if not set(features).issubset(df.columns):
+ logger.error('Features defined is not contained in data')
+ exit(1)
+
+ df = df[features]
+
+ return df
+
+ @staticmethod
+ def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
+ """
+ Sample data to reduce number of rows for model building if needed
+ """
+
+ if subsample_amount:
+ df = df.sample(subsample_amount, random_state=RANDOM_SEED)
+ return df
+
+ def process(
+ self,
+ df: pd.DataFrame,
+ target_column: str = "RDSAP_CHANGE",
+ features: List[str] = None,
+ subsample_amount: int = None
+ ) -> pd.DataFrame:
+ """
+ Pipeline to get data ready for building a model
+ """
+ df = self.subsample_data(df, subsample_amount=subsample_amount)
+ df = self.drop_unused_columns(df, target_column=target_column)
+ df = self.retain_features(df, features=features)
+ return df
diff --git a/model_data/simulation_system/Logger.py b/model_data/simulation_system/core/Logger.py
similarity index 89%
rename from model_data/simulation_system/Logger.py
rename to model_data/simulation_system/core/Logger.py
index 5197e7ce..8603fff6 100644
--- a/model_data/simulation_system/Logger.py
+++ b/model_data/simulation_system/core/Logger.py
@@ -1,3 +1,7 @@
+"""
+Logger that will be used throughout the application
+"""
+
import logging
def setup_logger():
diff --git a/model_data/simulation_system/Settings.py b/model_data/simulation_system/core/Settings.py
similarity index 78%
rename from model_data/simulation_system/Settings.py
rename to model_data/simulation_system/core/Settings.py
index 1d302abf..e562a39b 100644
--- a/model_data/simulation_system/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@@ -1,5 +1,34 @@
# Using a simply python file as settings for now
# TODO: migrate to dynaconf
+from pathlib import Path
+
+# Can move to a hyperparameters file
+# If anything we might want to have a file that can be loaded and sent to this script
+MODEL_HYPERPARAMETERS = {
+ "autogluon": {
+ 'problem_type': 'regression',
+ 'eval_metric': 'mean_absolute_error',
+ 'time_limit': 30,
+ 'presets': 'medium_quality',
+ 'excluded_model_types': None
+ }
+}
+
+RANDOM_SEED = 0
+SUBSAMPLE_FACTOR = 200
+
+TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet'
+TEST_DATA_NAME = 'test_data.parquet'
+
+REGISTRY_FILE = "model_registry.csv"
+MODEL_DIRECTORY = "model_directory"
+BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
+PREDICTION_LOCATION = Path("predictions")
+PREDICTION_FILE = 'prediction.json'
+METADATA_FILE = 'metadata.json'
+MODEL_FOLDER = "model"
+METRICS_FOLDER = "metrics"
+DEPLOYMENT_FOLDER = "deployment"
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
diff --git a/model_data/simulation_system/core/__init__.py b/model_data/simulation_system/core/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/model_data/simulation_system/docker-compose.yml b/model_data/simulation_system/docker-compose.yml
new file mode 100644
index 00000000..55f181bc
--- /dev/null
+++ b/model_data/simulation_system/docker-compose.yml
@@ -0,0 +1,17 @@
+version: '3'
+
+services:
+ minio:
+ image: minio/minio
+ ports:
+ - "9000:9000"
+ - "9001:9001"
+ volumes:
+ - ./data:/data
+ environment:
+ MINIO_ROOT_USER: &MINIO_USER admin
+ MINIO_ROOT_PASSWORD: &MINIO_PASS password
+ command: server --console-address ":9001" /data
+
+# volumes:
+# minio_storage: {}
\ No newline at end of file
diff --git a/model_data/simulation_system/energy_predictor.py b/model_data/simulation_system/energy_predictor.py
index 87ad3799..ab29f39c 100644
--- a/model_data/simulation_system/energy_predictor.py
+++ b/model_data/simulation_system/energy_predictor.py
@@ -1,5 +1,5 @@
from pathlib import Path
-from Settings import (
+from core.Settings import (
RDSAP_RESPONSE,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP,
diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/generate_rdsap_change.py
similarity index 95%
rename from model_data/simulation_system/app.py
rename to model_data/simulation_system/generate_rdsap_change.py
index 9ac2c13d..2400e7c7 100644
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@@ -1,24 +1,24 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
-from model_data.BaseUtility import Definitions
+
from pathlib import Path
-from model_data.simulation_system.Settings import (
+from core.Settings import (
MANDATORY_FIXED_FEATURES,
AVERAGE_FIXED_FEATURES,
LATEST_FIELD,
COMPONENT_FEATURES,
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
- COLUMNS_TO_MERGE_ON,
- FLOOR_LEVEL_MAP,
- BUILT_FORM_REMAP
+ COLUMNS_TO_MERGE_ON
)
-from DataProcessor import DataProcessor
+from core.DataProcessor import DataProcessor
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
+# TODO: Have a look at temporal features
+
def app():
# Get all the files in the directory
@@ -85,9 +85,6 @@ def app():
# Take the more recent value since it's likely to be more accurate
vals = [vals[-1]]
- if len(vals) == 0:
- wrong_var
-
fixed_data[field] = np.mean(vals)
# Combine all fields together
diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py
new file mode 100644
index 00000000..aa6c2d0f
--- /dev/null
+++ b/model_data/simulation_system/predictions.py
@@ -0,0 +1,139 @@
+"""
+Script to load MLModel class and generate predictions
+"""
+
+import json
+import argparse
+from model_data.simulation_system.MLModel.Models import AutogluonModel
+from model_data.simulation_system.core.Logger import logger
+from model_data.simulation_system.core.DataLoader import DataLoader
+import pandas as pd
+from typing import Optional
+from datetime import datetime
+from model_data.simulation_system.core.Settings import (
+ BASE_REGISTRY_PATH,
+ REGISTRY_FILE,
+ PREDICTION_LOCATION,
+ PREDICTION_FILE,
+ METADATA_FILE
+)
+
+TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
+
+# FOR TESTING
+# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to
+# DataFrame)
+# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
+# DATA = TEST_DATA.sample(1)
+
+
+def ingest_arguments() -> argparse.Namespace:
+ """
+ Helper function to take in arguments from script start
+ """
+
+ parser = argparse.ArgumentParser(description='Inputs for training script')
+ parser.add_argument('--target-column', type=str, help='The response variable you are predicting for',
+ choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
+ parser.add_argument('--model-path', type=str,
+ help='If you wish to use a specific model, specify the model path here')
+ parser.add_argument('--data', type=str, help='Json data for predictions')
+ parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
+
+ args = parser.parse_args()
+
+ return args
+
+
+def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None,
+ data_path: Optional[str] = None):
+ """
+ Main pipeline function
+ """
+
+ registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
+
+ if registry_path is None or not registry_path.exists():
+ logger.error("No registry path provided or registry doesn't exist")
+ exit(1)
+
+ if model_path is not None:
+ logger.info("User specified a model to load - ignoring registry")
+ model_location = model_path
+ model_type = model_path
+ model_name = model_path
+ else:
+ # TODO: Think about where registry will sit/ type
+ logger.info("Loading best model from registry")
+ registry_df = pd.read_csv(registry_path)
+ best_model_df = registry_df[registry_df['best_model']]
+
+ model_location = best_model_df['model_location'].values[0]
+ model_type = best_model_df['model_type'].values[0]
+ model_name = best_model_df['model_name'].values[0]
+
+ logger.info("--- Model Info: ---")
+ logger.info(f"Model type: {model_type}")
+ logger.info(f"Model name: {model_name}")
+ logger.info(f"Model location: {model_location}")
+
+ logger.info("--- Loading Data ---")
+ if data is None and data_path is None:
+ logger.error("No Data/Data Path passed")
+ exit(1)
+ if data_path and data is None:
+ logger.info("Loading data from provided path")
+ data = DataLoader().load(filepath=data_path, index_col="UPRN")
+
+ # TODO: DOWNSAMPLING DOWN TO JUST USE ONE FOR PREDICTION
+ data = data.sample(1)
+ else:
+ logger.info('Using data provided')
+ data = json.loads(data)
+ data = pd.DataFrame([data])
+ print(data)
+
+ logger.info("--- Loading Model ---")
+ model = AutogluonModel()
+
+ model.load_model(filepath=model_location)
+
+ logger.info("--- Generating Predictions ---")
+ prediction = model.generate_predictions(data=data)
+
+ # Save prediction some where?
+ # prediction.to_csv("s3?")
+
+ # TODO: Check how we want to structure outputs
+ # For now, just categorise by uprn and timestamp
+ # Assume one uprn coming in for now
+ uprn = data.index.values[0]
+
+ # Saving prediction local for now
+ # TODO: change uprn to TARGET_ID, put in setting
+ logger.info("--- Outputting prediction and metadata --- ")
+ output_base = PREDICTION_LOCATION / target_column / uprn / TIMESTAMP
+ output_base.mkdir(parents=True, exist_ok=True)
+
+ # TODO: change model.model.info to a class method for MLModel
+ json_prediction = prediction.to_json(output_base / PREDICTION_FILE)
+ prediction_metadata = {
+ "model_type": model_type,
+ "model_name": model_name,
+ "model_location": model_location,
+ "model_settings": model.model.info()
+ }
+
+ pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE)
+
+ return json_prediction
+
+
+if __name__ == "__main__":
+ args = ingest_arguments()
+
+ # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
+ # Data path can be passed as so: python3 predictions.py --data-path
+ # ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
+ prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
diff --git a/model_data/simulation_system/requirements.txt b/model_data/simulation_system/requirements.txt
new file mode 100644
index 00000000..62db1719
--- /dev/null
+++ b/model_data/simulation_system/requirements.txt
@@ -0,0 +1,208 @@
+absl-py==1.4.0
+accelerate==0.16.0
+aiohttp==3.8.5
+aiohttp-cors==0.7.0
+aiosignal==1.3.1
+aliyun-python-sdk-core==2.13.36
+aliyun-python-sdk-kms==2.16.1
+antlr4-python3-runtime==4.9.3
+asttokens==2.2.1
+async-timeout==4.0.3
+attrs==23.1.0
+autogluon==0.8.2
+autogluon.common==0.8.2
+autogluon.core==0.8.2
+autogluon.features==0.8.2
+autogluon.multimodal==0.8.2
+autogluon.tabular==0.8.2
+autogluon.timeseries==0.8.2
+backcall==0.2.0
+beautifulsoup4==4.12.2
+blessed==1.20.0
+blis==0.7.10
+boto3==1.28.25
+botocore==1.31.25
+cachetools==5.3.1
+catalogue==2.0.9
+catboost==1.2
+certifi==2023.7.22
+cffi==1.15.1
+charset-normalizer==3.2.0
+click==8.1.6
+cloudpickle==2.2.1
+colorama==0.4.6
+colorful==0.5.5
+comm==0.1.4
+confection==0.1.1
+contourpy==1.1.0
+crcmod==1.7
+cryptography==41.0.3
+cycler==0.11.0
+cymem==2.0.7
+datasets==2.14.4
+debugpy==1.6.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.7
+distlib==0.3.7
+evaluate==0.3.0
+executing==1.2.0
+fastai==2.7.12
+fastcore==1.5.29
+fastdownload==0.0.7
+fastprogress==1.0.3
+filelock==3.12.2
+fonttools==4.42.0
+frozenlist==1.4.0
+fsspec==2023.6.0
+future==0.18.3
+gdown==4.7.1
+gluonts==0.13.3
+google-api-core==2.11.1
+google-auth==2.22.0
+google-auth-oauthlib==1.0.0
+googleapis-common-protos==1.60.0
+gpustat==1.1
+graphviz==0.20.1
+grpcio==1.50.0
+huggingface-hub==0.16.4
+hyperopt==0.2.7
+idna==3.4
+imageio==2.31.1
+ipykernel==6.25.1
+ipython==8.14.0
+jedi==0.19.0
+Jinja2==3.1.2
+jmespath==0.10.0
+joblib==1.3.2
+jsonschema==4.17.3
+jupyter_client==8.3.0
+jupyter_core==5.3.1
+kiwisolver==1.4.4
+langcodes==3.3.0
+lightgbm==3.3.5
+lightning-utilities==0.9.0
+llvmlite==0.40.1
+Markdown==3.4.4
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+matplotlib==3.7.2
+matplotlib-inline==0.1.6
+mdurl==0.1.2
+mlforecast==0.7.3
+model-index==0.1.11
+msgpack==1.0.5
+multidict==6.0.4
+multiprocess==0.70.15
+murmurhash==1.0.9
+nest-asyncio==1.5.7
+networkx==3.1
+nlpaug==1.1.11
+nltk==3.8.1
+nptyping==2.4.1
+numba==0.57.1
+numpy==1.24.4
+nvidia-ml-py==12.535.77
+oauthlib==3.2.2
+omegaconf==2.2.3
+opencensus==0.11.2
+opencensus-context==0.1.3
+opendatalab==0.0.10
+openmim==0.3.9
+openxlab==0.0.17
+ordered-set==4.1.0
+oss2==2.17.0
+packaging==23.1
+pandas==1.5.3
+parso==0.8.3
+pathy==0.10.2
+patsy==0.5.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.5.0
+platformdirs==3.10.0
+plotly==5.16.0
+preshed==3.0.8
+prometheus-client==0.17.1
+prompt-toolkit==3.0.39
+protobuf==3.20.2
+psutil==5.9.5
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py-spy==0.3.14
+py4j==0.10.9.7
+pyarrow==12.0.1
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pycparser==2.21
+pycryptodome==3.18.0
+pydantic==1.10.12
+Pygments==2.16.1
+pyparsing==3.0.9
+pyrsistent==0.19.3
+PySocks==1.7.1
+pytesseract==0.3.10
+python-dateutil==2.8.2
+pytorch-lightning==1.9.5
+pytorch-metric-learning==1.7.3
+pytz==2023.3
+PyWavelets==1.4.1
+PyYAML==6.0.1
+pyzmq==25.1.0
+ray==2.3.1
+regex==2023.8.8
+requests==2.28.2
+requests-oauthlib==1.3.1
+responses==0.18.0
+rich==13.4.2
+rsa==4.9
+s3transfer==0.6.1
+safetensors==0.3.2
+scikit-image==0.19.3
+scikit-learn==1.2.2
+scipy==1.11.1
+seaborn==0.12.2
+sentencepiece==0.1.99
+seqeval==1.2.2
+six==1.16.0
+smart-open==6.3.0
+soupsieve==2.4.1
+spacy==3.6.1
+spacy-legacy==3.0.12
+spacy-loggers==1.0.4
+srsly==2.4.7
+stack-data==0.6.2
+statsforecast==1.4.0
+statsmodels==0.14.0
+tabulate==0.9.0
+tenacity==8.2.2
+tensorboard==2.14.0
+tensorboard-data-server==0.7.1
+tensorboardX==2.6.2
+text-unidecode==1.3
+thinc==8.1.12
+threadpoolctl==3.2.0
+tifffile==2023.7.18
+timm==0.9.5
+tokenizers==0.13.3
+toolz==0.12.0
+torch==1.13.1
+torchmetrics==0.11.4
+torchvision==0.14.1
+tornado==6.3.2
+tqdm==4.65.1
+traitlets==5.9.0
+transformers==4.26.1
+typer==0.9.0
+typing_extensions==4.7.1
+tzdata==2023.3
+ujson==5.8.0
+urllib3==1.26.16
+virtualenv==20.24.3
+wasabi==1.1.2
+wcwidth==0.2.6
+Werkzeug==2.3.6
+window-ops==0.0.14
+xgboost==1.7.6
+xxhash==3.3.0
+yarl==1.9.2
diff --git a/model_data/simulation_system/requirements/prediction.txt b/model_data/simulation_system/requirements/prediction.txt
new file mode 100644
index 00000000..f9ce32bf
--- /dev/null
+++ b/model_data/simulation_system/requirements/prediction.txt
@@ -0,0 +1,2 @@
+autogluon==0.8.2
+pandas==1.5.3
\ No newline at end of file
diff --git a/model_data/simulation_system/requirements/training.txt b/model_data/simulation_system/requirements/training.txt
new file mode 100644
index 00000000..17e4c8da
--- /dev/null
+++ b/model_data/simulation_system/requirements/training.txt
@@ -0,0 +1,3 @@
+autogluon==0.8.2
+pandas==1.5.3
+seaborn==0.12.2
diff --git a/model_data/simulation_system/test_data_generation.py b/model_data/simulation_system/test_data_generation.py
index fb7d7c64..d57c90f8 100644
--- a/model_data/simulation_system/test_data_generation.py
+++ b/model_data/simulation_system/test_data_generation.py
@@ -1,9 +1,12 @@
-from Logger import logger
+from core.Logger import logger
import argparse
import pandas as pd
from pathlib import Path
-
-RANDOM_SEED = 0
+from core.Settings import (
+ RANDOM_SEED,
+ TRAIN_AND_VALIDATION_DATA_NAME,
+ TEST_DATA_NAME
+)
def ingest_arguments() -> argparse.Namespace:
"""
@@ -56,8 +59,8 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
logger.info('--- Saving data ---')
- train_validation_data.to_parquet(Path(output_folder)/'train_validation_data.parquet')
- test_data.to_parquet(Path(output_folder)/'test_data.parquet')
+ train_validation_data.to_parquet(Path(output_folder)/ TRAIN_AND_VALIDATION_DATA_NAME)
+ test_data.to_parquet(Path(output_folder)/ TEST_DATA_NAME)
logger.info(' ---Pipeline complete---')
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index da2c6f4a..2a1dfcfa 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -1,19 +1,47 @@
-import os
-import pandas as pd
import argparse
-from typing import List
-from Logger import logger
-from autogluon.tabular import TabularDataset, TabularPredictor
+# import boto3
+from pathlib import Path
+from datetime import datetime
+from model_data.simulation_system.core.Logger import logger
+from model_data.simulation_system.core.DataLoader import DataLoader
+from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
+from model_data.simulation_system.MLModel.Models import AutogluonModel
+import pandas as pd
+from model_data.simulation_system.core.Settings import (
+ MODEL_DIRECTORY,
+ BASE_REGISTRY_PATH,
+ REGISTRY_FILE,
+ MODEL_FOLDER,
+ METRICS_FOLDER,
+ DEPLOYMENT_FOLDER,
+ SUBSAMPLE_FACTOR,
+ MODEL_HYPERPARAMETERS
+)
+import seaborn as sns
+import matplotlib.pyplot as plt
+TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
-FEATURE_COLUMNS = None
-RANDOM_SEED = 0
# FOR TESTING
-train_filepath = "./model_build_data/train_validation_data.parquet"
-test_filepath = "./model_build_data/test_data.parquet"
+# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
+# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
+# target_column = "RDSAP_CHANGE"
+# model_type = "autogluon"
+# hyperparameters = MODEL_HYPERPARAMETERS[model_type]
+# SUBSAMPLE_FACTOR = 200
+# SESSION = boto3.Session()
+
+# S3_CLIENT = SESSION.client(
+# service_name="s3",
+# aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", 'admin'),
+# aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", 'password'),
+# endpoint_url=os.environ.get("ENDPOINT_URL", "http://localhost:9000")
+# )
+
+# S3_CLIENT.create_bucket
+# S3_CLIENT.list_buckets()
def ingest_arguments() -> argparse.Namespace:
"""
@@ -22,122 +50,168 @@ def ingest_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Inputs for training script')
- parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
- parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
+ parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training',
+ required=True)
+ parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing',
+ required=True)
+ parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"],
+ default="autogluon")
+ parser.add_argument('--target-column', type=str, help='The response variable',
+ choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
args = parser.parse_args()
return args
-class DataLoader():
-
- @staticmethod
- def load(filepath: str) -> pd.DataFrame:
- """
- Load different datasets
- """
- if filepath.endswith('.parquet'):
- df = pd.read_parquet(filepath)
- elif filepath.endswith('.csv.'):
- df = pd.read_csv(filepath)
- else:
- logger.error('Not implemented!')
- exit(1)
-
- return df
-
-class FeatureProcessor:
- """
- Handle all feature manipulation before modelling
- """
-
- @staticmethod
- def drop_columns(df: pd.DataFrame, drop_columns: str = DROP_COLUMNS) -> pd.DataFrame:
- df = df.drop(columns=[drop_columns])
- return df
-
- def retain_features(df: pd.DataFrame, features: List[str] = None):
- """
- Determine which columns to keep ofr modelling
- """
- if features is None:
- features = df.columns
- else:
- if not set(features).issubset(df.columns):
- logger.error('Features defined is not contained in data')
- exit(1)
-
- df = df[features]
-
- return df
-
- def process(self, df: pd.DataFrame) -> pd.DataFrame:
- df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
- df = self.retain_features(df, features=FEATURE_COLUMNS)
- return df
-
-
-
-def training(train_filepath: str, test_filepath: str) -> None:
+def training(
+ train_filepath: str,
+ test_filepath: str,
+ target_column: str = "RDSAP_CHANGE",
+ model_type: str = "autogluon",
+ hyperparameters: dict = None
+) -> None:
"""
Pipeline to run training on the dataset
"""
- logger.info('Loading data')
+ logger.info('--- Loading data ---')
dataloader = DataLoader()
train_df = dataloader.load(filepath=train_filepath)
test_df = dataloader.load(filepath=test_filepath)
- # df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE'])
-
- logger.info('Feature processing')
+ logger.info('--- Feature processing ---')
+
feature_processor = FeatureProcessor()
- train_df = feature_processor.process(train_df)
- test_df = feature_processor.process(test_df)
- # logger.info('Split data into train and validation')
+ subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR)
- logger.info('Build Model')
-
- data = TabularDataset(data=train_filepath)
- data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE'])
- TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT']
- # top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))]
+ train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
+ test_df = feature_processor.process(test_df, target_column=target_column)
- data = data[['RDSAP_CHANGE'] + top_features.to_list()]
- # data = TabularDataset(data=train_df)
- # data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
- subsample_size = round(len(data)/20)
- data = data.sample(subsample_size, random_state=RANDOM_SEED)
+ logger.info('--- Build Model ---')
- # Add custom metric class MAPE
- # Have a look at temporal features
+ logger.info("--- Load Hyperparameters ---")
- target_column = 'RDSAP_CHANGE'
- predictor_RDSAP = TabularPredictor(
- label=target_column,
- path="agModels-predictRDSAP",
- problem_type="regression",
- eval_metric='mean_absolute_error'
- ).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN'])
+ if hyperparameters is None:
+ logger.info("Use base hyperparameters in settings")
+ hyperparameters = MODEL_HYPERPARAMETERS[model_type]
+ logger.info(f'Hyperparameters are: {hyperparameters}')
+ if model_type == "autogluon":
+ model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
+ output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
+ model = AutogluonModel(
+ output_filepath=output_base / MODEL_FOLDER
+ )
+ else:
+ raise ValueError("No alternative model implemented yet")
- logger.info('Evaluate matrics')
+ model.train_model(
+ data=train_df,
+ target_column=target_column,
+ hyperparameters=hyperparameters
+ )
- test_data = TabularDataset('./model_build_data/test_data.parquet')
- performance = predictor_RDSAP.evaluate(test_data)
- predictions = predictor_RDSAP.predict(test_data)
+ logger.info("--- Save Model ---")
+ model.save_model(output_filepath=model.output_filepath)
+
+ logger.info('--- Generate evaluation metrics ---')
+ metrics_df = model.model_evaluation(
+ validation_data=test_df,
+ target_column=target_column,
+ metrics_location=output_base / METRICS_FOLDER
+ )
+
+ logger.info("--- Generate metric outputs using predictions ---")
+ # TODO: could become a model.metric_outputs method
+ # For now just do it here
+ residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred'])
+
+ # image formatting
+ # TODO: move to settings file as AXIS_FONT, TITLE_FONT
+ axis_fs = 18 # fontsize
+ title_fs = 22 # fontsize
+ sns.set(style="whitegrid")
+ ax = sns.scatterplot(x="true", y="pred", data=residual_df)
+ ax.set_aspect('equal')
+ ax.set_xlabel(f'True {target_column}', fontsize=axis_fs)
+ ax.set_ylabel(f'Predicted {target_column}', fontsize=axis_fs) # ylabel
+ ax.set_title('Residuals', fontsize=title_fs)
+
+ # Reference line y = x (where predicted equals true)
+ ax.plot([-100, 100], [-100, 100], 'black', linewidth=1)
+
+ plt.tight_layout()
+ RESIDUAL_FILE = "residuals.png"
+ plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
+
+ # TODO: for cml, we might want a class that outputs all data and plots to add to the report
+ # If we want the residual plot (or any plots) in the report, we will need to self-host
+ # plt.savefig(RESIDUAL_FILE, dpi=120)
+
+ # TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
+ # Imagining for now that the model trained here is the best model amongst all models built
+
+ logger.info("--- Optimising model for deployment ---")
+
+ deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
+ logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
+
+ # TODO: Need a model registry - for now have this as a CSV
+ # Save this in the model directory
+ logger.info("--- Append registry with new model ---")
+ registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
+
+ if registry_path.exists():
+ logger.info("Registry file found - Loading into Dataframe")
+ registry_df = pd.read_csv(registry_path, index_col=None)
+ else:
+ # TODO: Move columns into settings: MODEL_DETAILS and Metrics class columns
+ registry_df = pd.DataFrame(
+ columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error',
+ 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
+
+ model_details_df = pd.DataFrame(
+ [{
+ 'model_type': model_type,
+ 'model_name': model_root,
+ 'model_location': deployment_model_path
+ }]
+ )
+
+ registry_row = pd.concat([model_details_df, metrics_df], axis=1)
+ registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
+
+ # TODO: will need a metric rebuild script - i.e. if we add new metrics, we will want to load models and
+ # regenerate the metrics for them
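+ # TODO: decide metric to optimise for. NOTE(review): the descending sort below flags the largest mean_absolute_error as best_model - confirm the stored metric is negated (higher is better)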
+ registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
+ registry_df['best_model'] = [False] * len(registry_df)
+ registry_df.loc[0, 'best_model'] = True
+
+ logger.info("--- Saving new model to registry ---")
+ # Ensure the directory exists
+ registry_path.parent.mkdir(parents=True, exist_ok=True)
+ registry_df.to_csv(registry_path, index=False)
+
+ logger.info("--- Training Pipeline Complete --- ")
- test_data['predictions'] = predictions
- test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions'])
if __name__ == "__main__":
-
logger.info('---Begin Pipeline---')
logger.info('---Ingest Arguments---')
args = ingest_arguments()
- training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)
\ No newline at end of file
+ # To run script: python3 training.py --train-filepath
+ # ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath
+ # ./model_build_data/change_data/rdsap_full/test_data.parquet
+ # TODO: Ingest hyperparameters from somewhere - currently the defaults in core.Settings MODEL_HYPERPARAMETERS are used
+ training(
+ train_filepath=args.train_filepath,
+ test_filepath=args.test_filepath,
+ target_column=args.target_column,
+ model_type=args.model_type
+ )
diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py
index 9b7dbd4e..d35befd7 100644
--- a/recommendations/recommendation_utils.py
+++ b/recommendations/recommendation_utils.py
@@ -1,6 +1,7 @@
from copy import deepcopy
from backend.Property import Property
from statistics import mean
+import random
def estimate_sap_points():
@@ -9,7 +10,7 @@ def estimate_sap_points():
:return:
"""
- return 999
+ return random.sample(range(4, 12), 1)[0]
def r_value_per_mm_to_u_value(depth_mm: int, r_value_per_mm: float):