Merge pull request #169 from Hestia-Homes/main

Model pipeline work + temp update to estimate_sap_points
This commit is contained in:
KhalimCK 2023-08-30 10:33:14 +01:00 committed by GitHub
commit af6b0e6cba
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
26 changed files with 1003 additions and 116 deletions

38
.github/workflows/cml.yml vendored Normal file
View file

@ -0,0 +1,38 @@
# Trains the model on every push to the `mlmodel` branch and publishes the
# generated metrics table as a CML comment.
name: model-training
on:
  push:
    branches:
      - mlmodel
permissions: write-all
jobs:
  run:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          # Pin the interpreter so the hard-pinned requirements resolve the same
          # way on every run instead of depending on the runner image's default
          # Python. 3.10 is the version the project's IDE configuration targets.
          python-version: "3.10"
      - uses: iterative/setup-cml@v1
      - name: Train model
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          ls
          cd model_data/simulation_system
          pip install -r requirements.txt
          python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
          cd model_directory/RDSAP_CHANGE
          # Collect the markdown metrics written by the training run into the report
          echo "## Model metrics" > report.md
          metrics_location=$(find . -maxdepth 10 -name "metrics.md")
          echo $metrics_location
          cat $metrics_location >> report.md
          # echo "## Residuals plot from model" >> report.md
          # metrics_location=$(find . -maxdepth 10 -name "residuals.png")
          # echo $metrics_location
          # cd $metric_location
          # echo "![](./residuals.png)" >> report.md
          cml comment create report.md
          # cml comment create --log debug --publish false report.md

1
.gitignore vendored
View file

@ -252,6 +252,7 @@ backend/.idea
open_uprn/.idea/
conservation_areas/.idea/
model_data/.idea/
model_data/simulation_system/.idea/
model_data/simulation_system/data*

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (simulation_system_prediction)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (simulation_system_prediction)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

0
__init__.py Normal file
View file

View file

@ -0,0 +1,59 @@
"""
BaseMLModel class
This is the base protocol:
- Any implementation will be its own separate file
Key tasks:
- Template Model class for different model types
- Save model
- Load Model
- Generate Inference
"""
from pathlib import Path
from typing import Protocol, NamedTuple
import pandas as pd
class MLModel(Protocol):
    """
    Structural interface that every model implementation is expected to satisfy.

    The methods carry no behaviour of their own; concrete model classes
    provide the implementations.
    """

    def load_model(self, filepath: Path) -> None:
        """
        Load a previously saved model from ``filepath`` and keep it on the
        instance for later use.
        """
        ...

    def save_model(self, output_filepath: Path) -> None:
        """
        Persist the current model to ``output_filepath``.
        """
        ...

    def train_model(
        self,
        data: pd.DataFrame,
        target_column: str,
        hyperparameter: dict
    ) -> None:
        """
        Train a model on ``data`` to predict ``target_column`` using the
        model-specific hyperparameters supplied.
        """
        ...

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Produce predictions for every row of ``data`` with the loaded model.
        """
        ...

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics_location: Path = None
    ) -> NamedTuple:
        """
        Evaluate the model on ``validation_data`` and return the resulting
        predictions/metrics.
        """
        ...

    def optimise_model_for_deployment(self):
        """
        Performance post-processing that gets the model ready for deployment.
        """
        ...

    def generate_meta_data(self):
        """
        Produce metadata describing the model.

        NOTE(review): the original docstring was empty, so the expected
        contents of the metadata are not specified yet.
        """
        ...

View file

@ -0,0 +1,136 @@
"""
Different implementations of the MLModel Protocol
Uses the BaseMLModel protocol
Key tasks:
- Template Model class for different model types
- Save model
- Load Model
- Generate Inference
"""
from typing import NamedTuple
from pathlib import Path
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import mean_absolute_percentage_error
from model_data.simulation_system.core.Logger import logger
# Exact set of keys that must be present in the hyperparameters passed to train_model
AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
# Default file name for the CSV metrics; the markdown copy reuses the name with a .md suffix
METRIC_FILENAME = "metrics.csv"


class AutogluonModel:
    """
    Autogluon model that implements the MLModel Protocol
    """

    def __init__(self, output_filepath: Path = None) -> None:
        """
        :param output_filepath: Directory handed to AutoGluon as the predictor ``path``.
            Required for training, not needed when only loading an existing model.
        """
        self.model = None
        self.output_filepath = output_filepath
        # Populated by model_evaluation with the predictions it evaluated
        self.predictions = None

    def load_model(self, filepath: Path) -> None:
        """
        Load a TabularPredictor from ``filepath`` and store it on ``self.model``.
        """
        self.model = TabularPredictor.load(path=filepath)

    def save_model(self, output_filepath: Path = None) -> None:
        """
        No-op kept for protocol compatibility: nothing is written here, the method
        only logs that saving has already happened.
        """
        logger.info("Using AutoGluon Model - Model saving already occured")

    def train_model(
            self,
            data: pd.DataFrame,
            target_column: str,
            hyperparameters: dict = None) -> None:
        """
        Fit a TabularPredictor on ``data`` and store it on ``self.model``.

        :param data: Training data including the target column.
        :param target_column: Name of the label column.
        :param hyperparameters: Dict whose keys are exactly AUTOGLUON_HYPERPARAMETERS.
        :raises SystemExit: If no output path was configured, or the hyperparameters
            are missing or do not have exactly the required keys.
        """
        if self.output_filepath is None:
            logger.error("Please specify a output_filepath in order to train a model")
            raise SystemExit(1)

        # hyperparameters defaults to None, so guard before inspecting the keys;
        # previously this path failed with an AttributeError instead of a clear message.
        if hyperparameters is None or set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters):
            logger.error(
                "Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required"
            )
            raise SystemExit(1)

        training_data = TabularDataset(data=data)

        self.model = TabularPredictor(
            label=target_column,
            path=self.output_filepath,
            problem_type=hyperparameters['problem_type'],
            eval_metric=hyperparameters['eval_metric']
        ).fit(
            training_data,
            time_limit=hyperparameters['time_limit'],
            presets=hyperparameters['presets'],
            excluded_model_types=hyperparameters['excluded_model_types']
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Return the model's predictions for ``data``.

        :raises SystemExit: If no model has been loaded or trained.
        """
        if self.model is None:
            # Use the shared logger like the other error paths in this class
            logger.error("No model loaded/ trained")
            raise SystemExit(1)

        return self.model.predict(data)

    def model_evaluation(
            self,
            validation_data: pd.DataFrame,
            target_column: str,
            metrics_location: Path = None,
            metric_filename: str = METRIC_FILENAME
    ) -> pd.DataFrame:
        """
        Evaluate the model on ``validation_data`` and write the metrics to disk.

        The metrics returned by the predictor's ``evaluate`` call are extended with
        the mean absolute percentage error, then written as CSV (``metric_filename``)
        and as markdown (same name with a ``.md`` suffix) inside ``metrics_location``.
        The predictions used are kept on ``self.predictions``.

        :param validation_data: Data to evaluate on, including ``target_column``.
        :param target_column: Name of the column holding the true values.
        :param metrics_location: Output directory; defaults to the current folder.
        :param metric_filename: File name of the CSV metrics file.
        :return: Single-row DataFrame of the metrics.
        :raises SystemExit: If no model has been loaded or trained.
        """
        if self.model is None:
            logger.error("No model loaded/ trained - Unable to generate evaluation")
            raise SystemExit(1)

        if metrics_location is None:
            # Honour the warning: previously the code went on to call .mkdir on None
            logger.warning("Metrics will be outputted to current folder")
            metrics_location = Path(".")
        metrics_location = Path(metrics_location)

        performance = self.model.evaluate(validation_data)
        predictions = self.generate_predictions(validation_data)

        logger.info("Prediction used for evaluations are saved in self.predictions")
        self.predictions = predictions

        # TODO: Can have a custom metric class that defines all different metrics we want
        performance['mape'] = mean_absolute_percentage_error(validation_data[target_column], predictions)

        # parents=True so a nested, not-yet-existing metrics folder can be created
        metrics_location.mkdir(parents=True, exist_ok=True)
        metrics_df = pd.DataFrame([performance])

        csv_path = metrics_location / metric_filename
        logger.info(f"Saving metric file as {csv_path}")
        metrics_df.to_csv(csv_path)

        metrics_df.to_markdown(csv_path.with_suffix(".md"))

        return metrics_df

    def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
        """
        Clone the predictor into ``deployment_path`` via ``clone_for_deployment``.

        :return: The value returned by ``clone_for_deployment`` (a string path of the location).
        :raises ValueError: If no model is loaded or no deployment path is given.
        """
        if self.model is None:
            raise ValueError("No model to optimise for deployment")

        if deployment_path is None:
            raise ValueError("Deployment path required")

        # This will return a string path of the location
        return self.model.clone_for_deployment(deployment_path)

View file

@ -0,0 +1,14 @@
# Build the images and start the local services in one step
.PHONY: init
init: build docker

.PHONY: build
build:
	docker-compose build

# Start the services in detached mode
.PHONY: docker
docker:
	docker-compose up -d

# Use the same docker-compose CLI as the targets above so all targets
# work with a single installed tool
.PHONY: down
down:
	docker-compose down

View file

@ -0,0 +1,66 @@
# Simulation System
Starter Readme:
Steps for pipeline:
- (WIP) Use Makefile to start up mock up s3 service
- By running `make init`, this will run the `docker-compose build` and `docker-compose up -d`, which spins up a S3 service
- This docker compose is running in detached mode `-d`, so it will not output anything to the terminal
- Once the Minio service is run, you can run the `training.py` file to start a model build process
- This will output a model, for a given target column, and add model name composed of some of the hyperparameters
- An example of running this file is:
- `python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet`
- Outputs of the pipeline are:
- A model directory bucket
- A target variable prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
- A model type prefix (i.e. autogluon, tensorflow etc)
- A model name prefix (i.e. rdsap_change_medium_quality_60_TIMESTAMP)
- This model name is made up of target variable, quality, time spent training and timestamp
- Within this prefix, there are three folders:
- model
- The model path that can be loaded in the codebase
- deployment
- The optimised model that can be deployed (may or may not need this)
- metrics
- The metrics generated from the model (may or may not need this as this can be contained in the registry)
- Once model build is finished, you can run the `predictions.py` file to generate predictions
- By default, the prediction pipeline will select the best model based on **mean absolute error** from the model registry
- This can be overwritten by specifying a model_path, which will load an alternative model
- There are two ways of getting data into the pipeline:
- Using the `--data` argument:
- This is a JSON string which can be passed as `python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'`
- Note the single and double quotation marks, as this affects the ingestion
- Using the `--data-path` argument:
- This can be a filepath (Can imagine that we might want to pull data from S3/ DB)
- An example of running the file is:
- `python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet`
- Outputs of the pipeline are:
- prediction bucket
- a Target variables prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
- a uprn prefix (i.e 0123456789)
- a `prediction.json`
- a `metadata.json`
- This is all the metadata from the model (can change this if needed)
- NOTE: If you wish to change any settings, these are currently all in the `Settings.py` file
- It will be separated out eventually but for now, it works to keep track of anything that we might want to respecify.
- I.e. the hyperparameters for models are in here but will move into a separate configuration file
# TODO:
- Structure/ MLOps:
- Add configuration files (dev, staging, prod), including hyperparameters
- Add precommit hooks (linters, branch names, etc)
- Sphinx documentation
- Sort out local mock up services
- Sort out Model Registry
- Sort out Data version control
- Data Science:
- Implement a metrics class, to hold all metrics
- Rebuild metrics script (Could be a one off but good to have)
- Determine metrics
- Implement and test custom model (Tensorflow Decision Trees etc)
- Orchestration:
- Lambda handler for the pipeline

View file

@ -0,0 +1,25 @@
import pandas as pd
import os
class DataLoader:
    """
    Loads tabular datasets from disk into pandas DataFrames.
    """

    @staticmethod
    def load(filepath: "str | os.PathLike[str]", index_col: str = None) -> pd.DataFrame:
        """
        Load a ``.parquet`` or ``.csv`` file.

        :param filepath: Location of the file, as a string or a path-like object.
        :param index_col: Optional column to use as the index of the result.
        :return: The loaded DataFrame.
        :raises FileNotFoundError: If ``filepath`` does not exist.
        :raises ValueError: If the file extension is neither ``.parquet`` nor ``.csv``.
        """
        # Normalise to str so pathlib.Path inputs work as well as plain strings
        path = os.fspath(filepath)

        if not os.path.exists(path):
            raise FileNotFoundError(f"File not found: {filepath}")

        # Compare extensions case-insensitively (e.g. "DATA.CSV")
        lowered = path.lower()

        if lowered.endswith('.parquet'):
            df = pd.read_parquet(path)
            # The index is applied after reading; it is not passed to read_parquet
            if index_col is not None:
                df = df.set_index(index_col)
            return df

        if lowered.endswith('.csv'):
            return pd.read_csv(path, index_col=index_col)

        raise ValueError(f"File format not supported for file: {filepath}")

View file

@ -2,7 +2,7 @@ from pathlib import Path
import numpy as np
import pandas as pd
from model_data.BaseUtility import Definitions
from simulation_system.Settings import (
from simulation_system.core.Settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
FULLY_GLAZED_DESCRIPTIONS,
@ -23,6 +23,7 @@ class DataProcessor:
def __init__(self, filepath: Path) -> None:
self.filepath = filepath
self.data = None
def load_data(self, low_memory=False) -> None:
self.data = pd.read_csv(self.filepath, low_memory=low_memory)

View file

@ -0,0 +1,70 @@
"""
Create additional features from the dataset
"""
import pandas as pd
from typing import List
from model_data.simulation_system.core.Logger import logger
# Columns removed before modelling, depending on which response is being predicted
RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
# Seed used when subsampling so the same rows are drawn on every run
RANDOM_SEED = 0


class FeatureProcessor:
    """
    Prepares a dataset for modelling: subsampling, dropping columns that are
    not used for the chosen target and narrowing down to the wanted features.
    """

    @staticmethod
    def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame:
        """
        Drop the columns that are not used when modelling ``target_column``.

        Any target other than RDSAP_CHANGE / HEAT_DEMAND_CHANGE leaves the
        data untouched.
        """
        columns_by_target = {
            "RDSAP_CHANGE": RDSAP_CHANGE_DROP_COLUMNS,
            "HEAT_DEMAND_CHANGE": HEAT_DEMAND_CHANGE_DROP_COLUMNS,
        }
        unused = columns_by_target.get(target_column)
        if unused is None:
            return df
        return df.drop(columns=unused)

    @staticmethod
    def retain_features(df: pd.DataFrame, features: List[str] = None):
        """
        Keep only ``features``; with no features given, every column is kept.

        Exits the process when a requested feature is not a column of ``df``.
        """
        if features is not None and not set(features).issubset(df.columns):
            logger.error('Features defined is not contained in data')
            exit(1)

        selected = df.columns if features is None else features
        return df[selected]

    @staticmethod
    def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
        """
        Draw ``subsample_amount`` rows with a fixed seed; a falsy amount
        (None or 0) returns the data unchanged.
        """
        if not subsample_amount:
            return df
        return df.sample(subsample_amount, random_state=RANDOM_SEED)

    def process(
            self,
            df: pd.DataFrame,
            target_column: str = "RDSAP_CHANGE",
            features: List[str] = None,
            subsample_amount: int = None
    ) -> pd.DataFrame:
        """
        Run the full preparation pipeline: subsample, drop unused columns,
        then retain the requested features.
        """
        sampled = self.subsample_data(df, subsample_amount=subsample_amount)
        trimmed = self.drop_unused_columns(sampled, target_column=target_column)
        return self.retain_features(trimmed, features=features)

View file

@ -1,3 +1,7 @@
"""
Logger that will be used throughout the application
"""
import logging
def setup_logger():

View file

@ -1,5 +1,34 @@
# Using a simple Python file as settings for now
# TODO: migrate to dynaconf
from pathlib import Path
# Can move to a hyperparameters file
# If anything we might want to have a file that can be loaded and sent to this script
# Hyperparameters per model type; the keys of each entry are passed through to
# the model's training call.
MODEL_HYPERPARAMETERS = {
    "autogluon": {
        'problem_type': 'regression',
        'eval_metric': 'mean_absolute_error',
        'time_limit': 30,
        'presets': 'medium_quality',
        'excluded_model_types': None
    }
}
RANDOM_SEED = 0
# The training pipeline divides the number of training rows by this factor to
# get the subsample size.
SUBSAMPLE_FACTOR = 200
TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet'
TEST_DATA_NAME = 'test_data.parquet'
REGISTRY_FILE = "model_registry.csv"
MODEL_DIRECTORY = "model_directory"
# Resolves to <parent of this file's directory>/model_directory
BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
# Relative path, so predictions are written relative to the working directory
PREDICTION_LOCATION = Path("predictions")
PREDICTION_FILE = 'prediction.json'
METADATA_FILE = 'metadata.json'
# Sub-folder names used inside a model's output directory
MODEL_FOLDER = "model"
METRICS_FOLDER = "metrics"
DEPLOYMENT_FOLDER = "deployment"
# NOTE(review): units are not stated here - presumably square metres and metres; confirm
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45

View file

@ -0,0 +1,17 @@
# MinIO service; the README describes it as the local mock-up S3 service.
version: '3'
services:
  minio:
    image: minio/minio
    ports:
      # 9000 is published alongside 9001, the console address set in `command` below
      - "9000:9000"
      - "9001:9001"
    volumes:
      # Bucket data is kept in ./data next to this file
      - ./data:/data
    environment:
      # NOTE(review): placeholder credentials - presumably for local use only; confirm
      MINIO_ROOT_USER: &MINIO_USER admin
      MINIO_ROOT_PASSWORD: &MINIO_PASS password
    command: server --console-address ":9001" /data
# volumes:
# minio_storage: {}

View file

@ -1,5 +1,5 @@
from pathlib import Path
from Settings import (
from core.Settings import (
RDSAP_RESPONSE,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP,

View file

@ -1,24 +1,24 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
from model_data.BaseUtility import Definitions
from pathlib import Path
from model_data.simulation_system.Settings import (
from core.Settings import (
MANDATORY_FIXED_FEATURES,
AVERAGE_FIXED_FEATURES,
LATEST_FIELD,
COMPONENT_FEATURES,
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP
COLUMNS_TO_MERGE_ON
)
from DataProcessor import DataProcessor
from core.DataProcessor import DataProcessor
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
# TODO: Have a look at temporal features
def app():
# Get all the files in the directory
@ -85,9 +85,6 @@ def app():
# Take the more recent value since it's likely to be more accurate
vals = [vals[-1]]
if len(vals) == 0:
wrong_var
fixed_data[field] = np.mean(vals)
# Combine all fields together

View file

@ -0,0 +1,139 @@
"""
Script to load MLModel class and generate predictions
"""
import json
import argparse
from model_data.simulation_system.MLModel.Models import AutogluonModel
from model_data.simulation_system.core.Logger import logger
from model_data.simulation_system.core.DataLoader import DataLoader
import pandas as pd
from typing import Optional
from datetime import datetime
from model_data.simulation_system.core.Settings import (
BASE_REGISTRY_PATH,
REGISTRY_FILE,
PREDICTION_LOCATION,
PREDICTION_FILE,
METADATA_FILE
)
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# FOR TESTING
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to
# DataFrame)
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
# DATA = TEST_DATA.sample(1)
def ingest_arguments() -> argparse.Namespace:
    """
    Parse the command-line arguments of the prediction script.

    :return: Namespace with ``target_column``, ``model_path``, ``data`` and ``data_path``.
    """
    # Description/help texts previously referred to the training script
    parser = argparse.ArgumentParser(description='Inputs for prediction script')
    parser.add_argument('--target-column', type=str, help='The response variable you are predicting for',
                        choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
    parser.add_argument('--model-path', type=str,
                        help='If you wish to use a specific model, specify the model path here')
    parser.add_argument('--data', type=str, help='Json data for predictions')
    parser.add_argument('--data-path', type=str, help='Location of Parquet or CSV dataset to load for predictions')
    return parser.parse_args()
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None,
               data_path: Optional[str] = None) -> str:
    """
    Main pipeline function: pick a model, load the data, predict and store the outputs.

    :param target_column: Response variable; selects the registry and the output prefix.
    :param model_path: Optional explicit model location. When given, the registry is not consulted.
    :param data: Input record(s) as a JSON string, a dict or a DataFrame. Takes precedence
        over ``data_path`` when both are supplied.
    :param data_path: File to load the input from when ``data`` is not given; a single
        row is sampled from it.
    :return: The prediction serialised as a JSON string (also written to the prediction file).
    :raises SystemExit: If no usable model or no input data can be determined.
    """
    if model_path is not None:
        logger.info("User specified a model to load - ignoring registry")
        model_location = model_type = model_name = model_path
    else:
        # The registry is only needed (and only validated) when no explicit model was given
        registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
        if not registry_path.exists():
            logger.error("No registry path provided or registry doesn't exist")
            raise SystemExit(1)

        # TODO: Think about where registry will sit/ type
        logger.info("Loading best model from registry")
        registry_df = pd.read_csv(registry_path)
        best_model_df = registry_df[registry_df['best_model']]
        if best_model_df.empty:
            logger.error("Registry does not contain a model flagged as best_model")
            raise SystemExit(1)

        model_location = best_model_df['model_location'].values[0]
        model_type = best_model_df['model_type'].values[0]
        model_name = best_model_df['model_name'].values[0]

    logger.info("--- Model Info: ---")
    logger.info(f"Model type: {model_type}")
    logger.info(f"Model name: {model_name}")
    logger.info(f"Model location: {model_location}")

    logger.info("--- Loading Data ---")

    if data is None and data_path is None:
        logger.error("No Data/Data Path passed")
        raise SystemExit(1)

    if data is None:
        logger.info("Loading data from provided path")
        data = DataLoader().load(filepath=data_path, index_col="UPRN")
        # TODO: DOWNSAMPLING DOWN TO JUST USE ONE FOR PREDICTION
        data = data.sample(1)
    else:
        logger.info('Using data provided')
        # Accept the CLI's JSON string as well as an already-parsed dict or DataFrame
        if isinstance(data, str):
            data = json.loads(data)
        if not isinstance(data, pd.DataFrame):
            data = pd.DataFrame([data])

    logger.info("--- Loading Model ---")
    model = AutogluonModel()
    model.load_model(filepath=model_location)

    logger.info("--- Generating Predictions ---")
    predictions = model.generate_predictions(data=data)

    # TODO: Check how we want to structure outputs
    # For now, just categorise by uprn and timestamp
    # Assume one uprn coming in for now
    # Path segments must be strings; index values are typically ints
    uprn = str(data.index.values[0])

    # Saving prediction local for now
    # TODO: change uprn to TARGET_ID, put in setting
    logger.info("--- Outputting prediction and metadata --- ")
    output_base = PREDICTION_LOCATION / target_column / uprn / TIMESTAMP
    output_base.mkdir(parents=True, exist_ok=True)

    # to_json(path) returns None, so serialise first and write the string ourselves
    # to be able to return it to the caller
    json_prediction = predictions.to_json()
    (output_base / PREDICTION_FILE).write_text(json_prediction, encoding="utf-8")

    # TODO: change model.model.info to a class method for MLModel
    prediction_metadata = {
        "model_type": model_type,
        "model_name": model_name,
        "model_location": model_location,
        "model_settings": model.model.info()
    }

    pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE)

    return json_prediction
if __name__ == "__main__":
    # Script entry point: forward the parsed command-line options to the pipeline.
    # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
    # Data path can be passed as so: python3 predictions.py --data-path
    # ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
    cli_args = ingest_arguments()
    prediction(
        target_column=cli_args.target_column,
        model_path=cli_args.model_path,
        data=cli_args.data,
        data_path=cli_args.data_path,
    )

View file

@ -0,0 +1,208 @@
absl-py==1.4.0
accelerate==0.16.0
aiohttp==3.8.5
aiohttp-cors==0.7.0
aiosignal==1.3.1
aliyun-python-sdk-core==2.13.36
aliyun-python-sdk-kms==2.16.1
antlr4-python3-runtime==4.9.3
asttokens==2.2.1
async-timeout==4.0.3
attrs==23.1.0
autogluon==0.8.2
autogluon.common==0.8.2
autogluon.core==0.8.2
autogluon.features==0.8.2
autogluon.multimodal==0.8.2
autogluon.tabular==0.8.2
autogluon.timeseries==0.8.2
backcall==0.2.0
beautifulsoup4==4.12.2
blessed==1.20.0
blis==0.7.10
boto3==1.28.25
botocore==1.31.25
cachetools==5.3.1
catalogue==2.0.9
catboost==1.2
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.6
cloudpickle==2.2.1
colorama==0.4.6
colorful==0.5.5
comm==0.1.4
confection==0.1.1
contourpy==1.1.0
crcmod==1.7
cryptography==41.0.3
cycler==0.11.0
cymem==2.0.7
datasets==2.14.4
debugpy==1.6.7
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.7
distlib==0.3.7
evaluate==0.3.0
executing==1.2.0
fastai==2.7.12
fastcore==1.5.29
fastdownload==0.0.7
fastprogress==1.0.3
filelock==3.12.2
fonttools==4.42.0
frozenlist==1.4.0
fsspec==2023.6.0
future==0.18.3
gdown==4.7.1
gluonts==0.13.3
google-api-core==2.11.1
google-auth==2.22.0
google-auth-oauthlib==1.0.0
googleapis-common-protos==1.60.0
gpustat==1.1
graphviz==0.20.1
grpcio==1.50.0
huggingface-hub==0.16.4
hyperopt==0.2.7
idna==3.4
imageio==2.31.1
ipykernel==6.25.1
ipython==8.14.0
jedi==0.19.0
Jinja2==3.1.2
jmespath==0.10.0
joblib==1.3.2
jsonschema==4.17.3
jupyter_client==8.3.0
jupyter_core==5.3.1
kiwisolver==1.4.4
langcodes==3.3.0
lightgbm==3.3.5
lightning-utilities==0.9.0
llvmlite==0.40.1
Markdown==3.4.4
markdown-it-py==3.0.0
MarkupSafe==2.1.3
matplotlib==3.7.2
matplotlib-inline==0.1.6
mdurl==0.1.2
mlforecast==0.7.3
model-index==0.1.11
msgpack==1.0.5
multidict==6.0.4
multiprocess==0.70.15
murmurhash==1.0.9
nest-asyncio==1.5.7
networkx==3.1
nlpaug==1.1.11
nltk==3.8.1
nptyping==2.4.1
numba==0.57.1
numpy==1.24.4
nvidia-ml-py==12.535.77
oauthlib==3.2.2
omegaconf==2.2.3
opencensus==0.11.2
opencensus-context==0.1.3
opendatalab==0.0.10
openmim==0.3.9
openxlab==0.0.17
ordered-set==4.1.0
oss2==2.17.0
packaging==23.1
pandas==1.5.3
parso==0.8.3
pathy==0.10.2
patsy==0.5.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.5.0
platformdirs==3.10.0
plotly==5.16.0
preshed==3.0.8
prometheus-client==0.17.1
prompt-toolkit==3.0.39
protobuf==3.20.2
psutil==5.9.5
ptyprocess==0.7.0
pure-eval==0.2.2
py-spy==0.3.14
py4j==0.10.9.7
pyarrow==12.0.1
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pycryptodome==3.18.0
pydantic==1.10.12
Pygments==2.16.1
pyparsing==3.0.9
pyrsistent==0.19.3
PySocks==1.7.1
pytesseract==0.3.10
python-dateutil==2.8.2
pytorch-lightning==1.9.5
pytorch-metric-learning==1.7.3
pytz==2023.3
PyWavelets==1.4.1
PyYAML==6.0.1
pyzmq==25.1.0
ray==2.3.1
regex==2023.8.8
requests==2.28.2
requests-oauthlib==1.3.1
responses==0.18.0
rich==13.4.2
rsa==4.9
s3transfer==0.6.1
safetensors==0.3.2
scikit-image==0.19.3
scikit-learn==1.2.2
scipy==1.11.1
seaborn==0.12.2
sentencepiece==0.1.99
seqeval==1.2.2
six==1.16.0
smart-open==6.3.0
soupsieve==2.4.1
spacy==3.6.1
spacy-legacy==3.0.12
spacy-loggers==1.0.4
srsly==2.4.7
stack-data==0.6.2
statsforecast==1.4.0
statsmodels==0.14.0
tabulate==0.9.0
tenacity==8.2.2
tensorboard==2.14.0
tensorboard-data-server==0.7.1
tensorboardX==2.6.2
text-unidecode==1.3
thinc==8.1.12
threadpoolctl==3.2.0
tifffile==2023.7.18
timm==0.9.5
tokenizers==0.13.3
toolz==0.12.0
torch==1.13.1
torchmetrics==0.11.4
torchvision==0.14.1
tornado==6.3.2
tqdm==4.65.1
traitlets==5.9.0
transformers==4.26.1
typer==0.9.0
typing_extensions==4.7.1
tzdata==2023.3
ujson==5.8.0
urllib3==1.26.16
virtualenv==20.24.3
wasabi==1.1.2
wcwidth==0.2.6
Werkzeug==2.3.6
window-ops==0.0.14
xgboost==1.7.6
xxhash==3.3.0
yarl==1.9.2

View file

@ -0,0 +1,2 @@
autogluon==0.8.2
pandas==1.5.3

View file

@ -0,0 +1,3 @@
autogluon==0.8.2
pandas==1.5.3
seaborn==0.12.2

View file

@ -1,9 +1,12 @@
from Logger import logger
from core.Logger import logger
import argparse
import pandas as pd
from pathlib import Path
RANDOM_SEED = 0
from core.Settings import (
RANDOM_SEED,
TRAIN_AND_VALIDATION_DATA_NAME,
TEST_DATA_NAME
)
def ingest_arguments() -> argparse.Namespace:
"""
@ -56,8 +59,8 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
logger.info('--- Saving data ---')
train_validation_data.to_parquet(Path(output_folder)/'train_validation_data.parquet')
test_data.to_parquet(Path(output_folder)/'test_data.parquet')
train_validation_data.to_parquet(Path(output_folder)/ TRAIN_AND_VALIDATION_DATA_NAME)
test_data.to_parquet(Path(output_folder)/ TEST_DATA_NAME)
logger.info(' ---Pipeline complete---')

View file

@ -1,19 +1,47 @@
import os
import pandas as pd
import argparse
from typing import List
from Logger import logger
from autogluon.tabular import TabularDataset, TabularPredictor
# import boto3
from pathlib import Path
from datetime import datetime
from model_data.simulation_system.core.Logger import logger
from model_data.simulation_system.core.DataLoader import DataLoader
from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
from model_data.simulation_system.MLModel.Models import AutogluonModel
import pandas as pd
from model_data.simulation_system.core.Settings import (
MODEL_DIRECTORY,
BASE_REGISTRY_PATH,
REGISTRY_FILE,
MODEL_FOLDER,
METRICS_FOLDER,
DEPLOYMENT_FOLDER,
SUBSAMPLE_FACTOR,
MODEL_HYPERPARAMETERS
)
import seaborn as sns
import matplotlib.pyplot as plt
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
FEATURE_COLUMNS = None
RANDOM_SEED = 0
# FOR TESTING
train_filepath = "./model_build_data/train_validation_data.parquet"
test_filepath = "./model_build_data/test_data.parquet"
# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
# target_column = "RDSAP_CHANGE"
# model_type = "autogluon"
# hyperparameter = HYPERPARAMETERS
# SUBSAMPLE_FACTOR = 200
# SESSION = boto3.Session()
# S3_CLIENT = SESSION.client(
# service_name="s3",
# aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", 'admin'),
# aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", 'password'),
# endpoint_url=os.environ.get("ENDPOINT_URL", "http://localhost:9000")
# )
# S3_CLIENT.create_bucket
# S3_CLIENT.list_buckets()
def ingest_arguments() -> argparse.Namespace:
"""
@ -22,122 +50,168 @@ def ingest_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Inputs for training script')
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training',
required=True)
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing',
required=True)
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"],
default="autogluon")
parser.add_argument('--target-column', type=str, help='The response variable',
choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
args = parser.parse_args()
return args
class DataLoader():
@staticmethod
def load(filepath: str) -> pd.DataFrame:
"""
Load different datasets
"""
if filepath.endswith('.parquet'):
df = pd.read_parquet(filepath)
elif filepath.endswith('.csv.'):
df = pd.read_csv(filepath)
else:
logger.error('Not implemented!')
exit(1)
return df
class FeatureProcessor:
"""
Handle all feature manipulation before modelling
"""
@staticmethod
def drop_columns(df: pd.DataFrame, drop_columns: str = DROP_COLUMNS) -> pd.DataFrame:
df = df.drop(columns=[drop_columns])
return df
def retain_features(df: pd.DataFrame, features: List[str] = None):
"""
Determine which columns to keep ofr modelling
"""
if features is None:
features = df.columns
else:
if not set(features).issubset(df.columns):
logger.error('Features defined is not contained in data')
exit(1)
df = df[features]
return df
def process(self, df: pd.DataFrame) -> pd.DataFrame:
df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
df = self.retain_features(df, features=FEATURE_COLUMNS)
return df
def training(train_filepath: str, test_filepath: str) -> None:
def training(
train_filepath: str,
test_filepath: str,
target_column: str = "RDSAP_CHANGE",
model_type: str = "autogluon",
hyperparameters: dict = None
) -> None:
"""
Pipeline to run training on the dataset
"""
logger.info('Loading data')
logger.info('--- Loading data ---')
dataloader = DataLoader()
train_df = dataloader.load(filepath=train_filepath)
test_df = dataloader.load(filepath=test_filepath)
# df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE'])
logger.info('Feature processing')
logger.info('--- Feature processing ---')
feature_processor = FeatureProcessor()
train_df = feature_processor.process(train_df)
test_df = feature_processor.process(test_df)
# logger.info('Split data into train and validation')
subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR)
logger.info('Build Model')
data = TabularDataset(data=train_filepath)
data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE'])
TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT']
# top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))]
train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
test_df = feature_processor.process(test_df, target_column=target_column)
data = data[['RDSAP_CHANGE'] + top_features.to_list()]
# data = TabularDataset(data=train_df)
# data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
subsample_size = round(len(data)/20)
data = data.sample(subsample_size, random_state=RANDOM_SEED)
logger.info('--- Build Model ---')
# Add custom metric class MAPE
# Have a look at temporal features
logger.info("--- Load Hyperparameters ---")
target_column = 'RDSAP_CHANGE'
predictor_RDSAP = TabularPredictor(
label=target_column,
path="agModels-predictRDSAP",
problem_type="regression",
eval_metric='mean_absolute_error'
).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN'])
if hyperparameters is None:
logger.info("Use base hyperparameters in settings")
hyperparameters = MODEL_HYPERPARAMETERS[model_type]
logger.info(f'Hyperparameters are: {hyperparameters}')
if model_type == "autogluon":
model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
model = AutogluonModel(
output_filepath=output_base / MODEL_FOLDER
)
else:
raise ValueError("No alternative model implemented yet")
logger.info('Evaluate matrics')
model.train_model(
data=train_df,
target_column=target_column,
hyperparameters=hyperparameters
)
test_data = TabularDataset('./model_build_data/test_data.parquet')
performance = predictor_RDSAP.evaluate(test_data)
predictions = predictor_RDSAP.predict(test_data)
logger.info("--- Save Model ---")
model.save_model(output_filepath=model.output_filepath)
logger.info('--- Generate evaluation metrics ---')
metrics_df = model.model_evaluation(
validation_data=test_df,
target_column=target_column,
metrics_location=output_base / METRICS_FOLDER
)
logger.info("--- Generate metric outputs using predictions ---")
# TODO: can have a model.metric_outputs method
# For now just do it here
residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred'])
# image formatting
# TODO: move to settings file , AXIS_FONT, TITLE_FONT
axis_fs = 18 # fontsize
title_fs = 22 # fontsize
sns.set(style="whitegrid")
ax = sns.scatterplot(x="true", y="pred", data=residual_df)
ax.set_aspect('equal')
ax.set_xlabel(f'True {target_column}', fontsize=axis_fs)
ax.set_ylabel(f'Predicted {target_column}', fontsize=axis_fs) # ylabel
ax.set_title('Residuals', fontsize=title_fs)
# Square aspect ratio
ax.plot([-100, 100], [-100, 100], 'black', linewidth=1)
plt.tight_layout()
RESIDUAL_FILE = "residuals.png"
plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
# TODO: for cml, we might want to have class that outputs all data and plots to add to the report
# If we want residual plot/ any plots, we will need to self host
# plt.savefig(RESIDUAL_FILE, dpi=120)
# TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
# Imagining for now that the model trained here is the best model amongst all models built
logger.info("--- Optimising model for deployment ---")
deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
# TODO: Need a model registry - for now have this as a CSV
# Save this in the model directory
logger.info("--- Append registry with new model ---")
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
if registry_path.exists():
logger.info("Registry file found - Loading into Dataframe")
registry_df = pd.read_csv(registry_path, index_col=None)
else:
# TODO: Moved columns into settings: MODEL_DETAILS and Metrics class columns
registry_df = pd.DataFrame(
columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error',
'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
model_details_df = pd.DataFrame(
[{
'model_type': model_type,
'model_name': model_root,
'model_location': deployment_model_path
}]
)
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
# TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and
# regenerate new metrics
# TODO: decide metric to optimise to
registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
registry_df['best_model'] = [False] * len(registry_df)
registry_df.loc[0, 'best_model'] = True
logger.info("--- Saving new model to registry ---")
# Ensure the directory exists
registry_path.parent.mkdir(parents=True, exist_ok=True)
registry_df.to_csv(registry_path, index=False)
logger.info("--- Training Pipeline Complete --- ")
test_data['predictions'] = predictions
test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions'])
if __name__ == "__main__":
logger.info('---Begin Pipeline---')
logger.info('---Ingest Arguments---')
args = ingest_arguments()
training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)
# To run script: python3 training.py --train-filepath
# ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath
# ./model_build_data/change_data/rdsap_full/test_data.parquet
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
training(
train_filepath=args.train_filepath,
test_filepath=args.test_filepath,
target_column=args.target_column,
model_type=args.model_type
)

View file

@ -1,6 +1,7 @@
from copy import deepcopy
from backend.Property import Property
from statistics import mean
import random
def estimate_sap_points():
@ -9,7 +10,7 @@ def estimate_sap_points():
:return:
"""
return 999
return random.sample(range(4, 12), 1)[0]
def r_value_per_mm_to_u_value(depth_mm: int, r_value_per_mm: float):