Merge pull request #133 from Hestia-Homes/mlmodel

Mlmodel
This commit is contained in:
KhalimCK 2023-08-25 12:22:59 +01:00 committed by GitHub
commit 4a73ebfb74
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
19 changed files with 982 additions and 111 deletions

38
.github/workflows/cml.yml vendored Normal file
View file

@ -0,0 +1,38 @@
# Trains the model on every push to the `mlmodel` branch and publishes the
# generated metrics table as a CML comment.
name: model-training
on:
  push:
    branches:
      - mlmodel
# NOTE(review): write-all grants the job token every scope. Narrowing it to the
# scopes `cml comment create` needs would be safer - confirm before changing.
permissions: write-all
jobs:
  run:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
          # Pin the interpreter. Without an explicit version the job uses
          # whatever Python the runner image ships, which drifts over time
          # and breaks the strictly pinned requirements (autogluon==0.8.2,
          # torch==1.13.1).
          python-version: "3.10"
      - uses: iterative/setup-cml@v1
      - name: Train model
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          ls
          cd model_data/simulation_system
          pip install -r requirements.txt
          python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
          cd model_directory/RDSAP_CHANGE
          echo "## Model metrics" > report.md
          # The model folder name contains a timestamp, so the markdown
          # metrics file has to be located rather than addressed directly.
          metrics_location=$(find . -maxdepth 10 -name "metrics.md")
          echo $metrics_location
          cat $metrics_location >> report.md
          # echo "## Residuals plot from model" >> report.md
          # metrics_location=$(find . -maxdepth 10 -name "residuals.png")
          # echo $metrics_location
          # cd $metric_location
          # echo "![](./residuals.png)" >> report.md
          cml comment create report.md
          # cml comment create --log debug --publish false report.md

View file

@ -0,0 +1,59 @@
"""
BaseMLModel class
This is the base protocol:
- Any implementation will be its own separate file
Key tasks:
- Template Model class for different model types
- Save model
- Load Model
- Generate Inference
"""
from pathlib import Path
from typing import Protocol, NamedTuple
import pandas as pd
class MLModel(Protocol):
    """
    Structural interface that every model implementation is expected to satisfy.

    The signatures mirror the concrete AutogluonModel implementation so that a
    static type checker accepts it as an MLModel.
    """

    def load_model(self, filepath: Path) -> None:
        """
        Load a previously trained model from ``filepath`` into the instance.
        """
        ...

    def save_model(self, output_filepath: Path) -> None:
        """
        Persist the currently held model to ``output_filepath``.
        """
        ...

    def train_model(
        self,
        data: pd.DataFrame,
        target_column: str,
        hyperparameters: dict
    ) -> None:
        """
        Train a model on ``data`` to predict ``target_column``.

        ``hyperparameters`` is model specific; each implementation defines the
        keys it requires.
        """
        ...

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Return predictions for ``data`` using the loaded or trained model.
        """
        ...

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics_location: Path = None
    ) -> pd.DataFrame:
        """
        Evaluate the model on ``validation_data`` and return the metrics.

        ``metrics_location`` is the folder the metrics are written to.
        """
        ...

    def optimise_model_for_deployment(self, deployment_path: Path = None):
        """
        Performance post-processing to make the model ready for deployment.

        ``deployment_path`` is where the optimised artefact should be written.
        """
        ...

    def generate_meta_data(self):
        """
        Produce metadata describing the model.

        NOTE(review): no implementation of this method is visible yet, so the
        return shape is undefined - confirm once one exists.
        """
        ...

View file

@ -0,0 +1,142 @@
"""
Different implementations of the MLModel Protocol
Uses the BaseMLModel protocol
Key tasks:
- Template Model class for different model types
- Save model
- Load Model
- Generate Inference
"""
from typing import NamedTuple
from pathlib import Path
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import mean_absolute_percentage_error
from core.Logger import logger
# Exact set of keys the hyperparameter dict given to
# AutogluonModel.train_model must contain (no more, no fewer).
AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
# Default name of the metrics CSV; the markdown copy reuses the same stem.
METRIC_FILENAME = "metrics.csv"


class AutogluonModel:
    """
    Autogluon model that implements the MLModel Protocol

    Attributes:
        model: the TabularPredictor, or None until one is loaded or trained.
        output_filepath: folder handed to TabularPredictor as its ``path``.
        predictions: predictions produced by the last model_evaluation call.
    """

    def __init__(self, output_filepath: Path = None) -> None:
        self.model = None
        self.output_filepath = output_filepath
        self.predictions = None

    def load_model(self, filepath: Path) -> None:
        """
        Load the predictor stored at ``filepath`` into ``self.model``.
        """
        self.model = TabularPredictor.load(path=filepath)

    def save_model(self, output_filepath: Path = None) -> None:
        """
        No-op kept for protocol compatibility.

        ``output_filepath`` is ignored: nothing is written here, the method
        only logs that saving has already happened.
        """
        logger.info("Using AutoGluon Model - Model saving already occured")

    def train_model(
        self,
        data: pd.DataFrame,
        target_column: str,
        hyperparameters: dict = None
    ) -> None:
        """
        Fit a TabularPredictor on ``data`` and keep it in ``self.model``.

        Args:
            data: training rows, including ``target_column``.
            target_column: name of the label column.
            hyperparameters: dict whose keys are exactly
                AUTOGLUON_HYPERPARAMETERS.

        Raises:
            SystemExit: if no ``output_filepath`` was configured, or if
                ``hyperparameters`` is missing or has the wrong keys.
        """
        if self.output_filepath is None:
            logger.error("Please specify a output_filepath in order to train a model")
            raise SystemExit(1)

        # None is the declared default, so it must be rejected cleanly rather
        # than failing with AttributeError on `.keys()`.
        if hyperparameters is None or set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters):
            logger.error("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
            raise SystemExit(1)

        training_data = TabularDataset(data=data)
        predictor = TabularPredictor(
            label=target_column,
            path=self.output_filepath,
            problem_type=hyperparameters['problem_type'],
            eval_metric=hyperparameters['eval_metric']
        )
        self.model = predictor.fit(
            training_data,
            time_limit=hyperparameters['time_limit'],
            presets=hyperparameters['presets'],
            excluded_model_types=hyperparameters['excluded_model_types']
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Return ``self.model.predict(data)``.

        NOTE(review): the annotation says DataFrame, but the value is whatever
        the predictor's ``predict`` returns - confirm the actual type.

        Raises:
            SystemExit: if no model has been loaded or trained.
        """
        if self.model is None:
            logger.error("No model loaded/ trained")
            raise SystemExit(1)
        return self.model.predict(data)

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics_location: Path = None,
        metric_filename: str = METRIC_FILENAME
    ) -> pd.DataFrame:
        """
        Evaluate the model and write the metrics as CSV and markdown.

        The predictor's own evaluation is extended with a ``mape`` entry and
        the predictions are kept in ``self.predictions``.

        Args:
            validation_data: rows to evaluate on, including ``target_column``.
            target_column: name of the label column.
            metrics_location: output folder; defaults to the current folder.
            metric_filename: name of the CSV file; the markdown file uses the
                same name with a ``.md`` suffix.

        Returns:
            A one-row DataFrame holding every metric.

        Raises:
            SystemExit: if no model has been loaded or trained.
        """
        if metrics_location is None:
            logger.warning("Metrics will be outputted to current folder")
            # Actually honour the warning instead of crashing on None.mkdir.
            metrics_location = Path(".")
        else:
            metrics_location = Path(metrics_location)

        if self.model is None:
            logger.error("No model loaded/ trained - Unable to generate evaluation")
            raise SystemExit(1)

        performance = self.model.evaluate(validation_data)
        predictions = self.generate_predictions(validation_data)
        logger.info("Prediction used for evaluations are saved in self.predictions")
        self.predictions = predictions

        # TODO: Can have a custom metric class that defines all different metrics we want
        performance['mape'] = mean_absolute_percentage_error(validation_data[target_column], predictions)

        logger.info("Saving metric file as %s", metric_filename)
        # parents=True so a not-yet-existing parent folder does not abort the run.
        metrics_location.mkdir(parents=True, exist_ok=True)
        metrics_df = pd.DataFrame([performance])
        metrics_df.to_csv(metrics_location / metric_filename)

        markdown_filename = Path(metric_filename).with_suffix(".md").name
        metrics_df.to_markdown(metrics_location / markdown_filename)
        return metrics_df

    def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
        """
        Clone the model for deployment and return the clone's location.

        Raises:
            SystemExit: if there is no model or no ``deployment_path``.
        """
        if self.model is None:
            logger.error("No model to optimise for deployment")
            raise SystemExit(1)
        if deployment_path is None:
            logger.error("Deployment path required")
            raise SystemExit(1)
        # This will return a string path of the location
        return self.model.clone_for_deployment(deployment_path)

View file

@ -0,0 +1,14 @@
# Build the images and start the stack in one go.
.PHONY: init
init: build docker

# Build the images declared in the compose file.
.PHONY: build
build:
	docker-compose build

# Start the services in detached mode.
.PHONY: docker
docker:
	docker-compose up -d

# Stop and remove the services. Uses the same `docker-compose` binary as the
# other targets so a machine needs only one compose flavour installed.
.PHONY: down
down:
	docker-compose down

View file

@ -0,0 +1,66 @@
# Simulation System
Starter Readme:
Steps for pipeline:
- (WIP) Use Makefile to start up mock up s3 service
- By running `make init`, this will run the `docker-compose build` and `docker-compose up -d`, which spins up a S3 service
- This docker compose is running in detached mode `-d`, so it will not output anything to the terminal
- Once the Minio service is run, you can run the `training.py` file to start a model build process
- This will output a model, for a given target column, and add model name composed of some of the hyperparameters
- An example of running this file is:
- `python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet`
- Outputs of the pipeline are:
- A model directory bucket
- A target variable prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
- A model type prefix (i.e. autogluon, tensorflow etc)
- A model name prefix (i.e. rdsap_change_medium_quality_60_TIMESTAMP)
- This model name is made up of target variable, quality, time spent training and timestamp
- Within this prefix, there are three folders:
- model
- The model path that can be loaded in the codebase
- deployment
- The optimised model that can be deployed (may or may not need this)
- metrics
- The metrics generated from the model (may or may not need this as this can be contained in the registry)
- Once model build is finished, you can run the `predictions.py` file to generate predictions
- By default, the prediction pipeline will select the best model based on **mean absolute error** from the model registry
- This can be overwritten by specifying a model_path, which will load an alternative model
- There are two ways of getting data into the pipeline:
- Using the `--data` argument:
- This is a JSON string which can be passed as `python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'`
- Note the single and double quotation marks, as this affects the ingestion
- Using the `--data-path` argument:
- This can be a filepath (Can imagine that we might want to pull data from S3/ DB)
- An example of running the file is:
- `python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet`
- Outputs of the pipeline are:
- prediction bucket
- a Target variables prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
- a uprn prefix (i.e 0123456789)
- a `prediction.json`
- a `metadata.json`
- This is all the metadata from the model (can change this if needed)
- NOTE: If you wish to change any settings, these are currently all in the `Settings.py` file
- It will be separated out eventually but for now, it works to keep track of anything that we might want to respecify.
- I.e. the hyperparameters for models are in here but will move into a separate configuration file
# TODO:
- Structure/ MLOps:
- Add configuration files (dev, staging, prod), including hyperparameters
- Add precommit hooks (linters, branch names, etc)
- Sphinx documentation
- Sort out local mock up services
- Sort out Model Registry
- Sort out Data version control
- Data Science:
- Implement a metrics class, to hold all metrics
- Rebuild metrics script (Could be a one off but good to have)
- Determine metrics
- Implement and test custom model (Tensorflow Decision Trees etc)
- Orchestration:
- Lambda handler for the pipeline

View file

@ -0,0 +1,21 @@
import pandas as pd
from core.Logger import logger
class DataLoader():
    """
    Load tabular datasets from disk into pandas DataFrames.
    """

    @staticmethod
    def load(filepath: str, index_col: str = None) -> pd.DataFrame:
        """
        Load a ``.parquet`` or ``.csv`` file.

        Args:
            filepath: location of the file, as a string or ``pathlib.Path``.
                The extension is matched case-insensitively.
            index_col: optional column to use as the DataFrame index.

        Returns:
            The loaded DataFrame.

        Raises:
            SystemExit: if the extension is neither ``.parquet`` nor ``.csv``.
        """
        # Normalise once so Path objects and upper-case extensions are handled
        # the same way as plain lower-case strings.
        location = str(filepath)
        lowered = location.lower()

        if lowered.endswith('.parquet'):
            df = pd.read_parquet(location)
            # read_parquet has no index_col argument, so set the index afterwards.
            if index_col is not None:
                df = df.set_index(index_col)
            return df

        if lowered.endswith('.csv'):
            return pd.read_csv(location, index_col=index_col)

        logger.error('Not implemented!')
        raise SystemExit(1)

View file

@ -2,7 +2,7 @@ from pathlib import Path
import numpy as np
import pandas as pd
from model_data.BaseUtility import Definitions
from simulation_system.Settings import (
from simulation_system.core.Settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
FULLY_GLAZED_DESCRIPTIONS,

View file

@ -0,0 +1,70 @@
"""
Create additional features from the dataset
"""
import pandas as pd
from typing import List
from core.Logger import logger
# Columns removed before modelling each target: the identifier plus the other
# response variable.
RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
# Seed used when subsampling so repeated runs pick the same rows.
RANDOM_SEED = 0


class FeatureProcessor:
    """
    Handle all feature manipulation before modelling
    """

    @staticmethod
    def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame:
        """
        Remove the columns that must not be used when modelling ``target_column``.

        Any other ``target_column`` value leaves ``df`` untouched.
        """
        if target_column == "RDSAP_CHANGE":
            return df.drop(columns=RDSAP_CHANGE_DROP_COLUMNS)
        if target_column == "HEAT_DEMAND_CHANGE":
            return df.drop(columns=HEAT_DEMAND_CHANGE_DROP_COLUMNS)
        return df

    @staticmethod
    def retain_features(df: pd.DataFrame, features: List[str] = None) -> pd.DataFrame:
        """
        Keep only the columns listed in ``features`` (all columns when None).

        Raises:
            SystemExit: if any requested feature is not a column of ``df``.
        """
        if features is not None and not set(features).issubset(df.columns):
            logger.error('Features defined is not contained in data')
            raise SystemExit(1)
        selected = df.columns if features is None else features
        return df[selected]

    @staticmethod
    def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
        """
        Sample data to reduce number of rows for model building if needed

        A falsy ``subsample_amount`` (None or 0) returns ``df`` unchanged. A
        request for more rows than ``df`` holds is capped at ``len(df)``.
        """
        if not subsample_amount:
            return df
        # DataFrame.sample raises ValueError when asked for more rows than
        # exist (sampling without replacement), so clamp the request.
        row_count = min(subsample_amount, len(df))
        return df.sample(row_count, random_state=RANDOM_SEED)

    def process(
        self,
        df: pd.DataFrame,
        target_column: str = "RDSAP_CHANGE",
        features: List[str] = None,
        subsample_amount: int = None
    ) -> pd.DataFrame:
        """
        Pipeline to get data ready for building a model

        Steps, in order: subsample rows, drop the columns that are unused for
        ``target_column``, then keep only ``features``.
        """
        df = self.subsample_data(df, subsample_amount=subsample_amount)
        df = self.drop_unused_columns(df, target_column=target_column)
        return self.retain_features(df, features=features)

View file

@ -1,3 +1,7 @@
"""
Logger that will be used throughout the application
"""
import logging
def setup_logger():

View file

@ -1,5 +1,34 @@
# Using a simple python file as settings for now
# TODO: migrate to dynaconf
from pathlib import Path

# Can move to a hyperparameters file
# If anything we might want to have a file that can be loaded and sent to this script
# Default hyperparameters, keyed by model type. training.py falls back to
# MODEL_HYPERPARAMETERS[model_type] when no hyperparameters are passed in.
MODEL_HYPERPARAMETERS = {
    "autogluon": {
        'problem_type': 'regression',
        'eval_metric': 'mean_absolute_error',
        'time_limit': 30,  # NOTE(review): presumably seconds - confirm against AutoGluon docs
        'presets': 'medium_quality',
        'excluded_model_types': None
    }
}

# Seed for reproducible sampling.
RANDOM_SEED = 0
# training.py keeps round(len(train_df) / SUBSAMPLE_FACTOR) training rows.
SUBSAMPLE_FACTOR = 200

# File names written by the train/test split script.
TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet'
TEST_DATA_NAME = 'test_data.parquet'

# The model registry is a CSV stored at
# BASE_REGISTRY_PATH / <target column> / REGISTRY_FILE.
REGISTRY_FILE = "model_registry.csv"
MODEL_DIRECTORY = "model_directory"
# Resolved relative to this file: two directory levels up, then MODEL_DIRECTORY.
BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY

# Relative path, so prediction outputs land under the current working directory.
PREDICTION_LOCATION = Path("predictions")
PREDICTION_FILE = 'prediction.json'
METADATA_FILE = 'metadata.json'

# Sub-folders created inside each trained model's output folder.
MODEL_FOLDER = "model"
METRICS_FOLDER = "metrics"
DEPLOYMENT_FOLDER = "deployment"

# NOTE(review): units are not stated here - presumably square metres and
# metres respectively; confirm with the data owner.
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45

View file

@ -0,0 +1,17 @@
# Local MinIO instance used as a mock S3 service during development.
version: '3'
services:
  minio:
    image: minio/minio
    ports:
      # NOTE(review): 9000 is presumably the S3 API port - confirm.
      - "9000:9000"
      # Web console, matching --console-address below.
      - "9001:9001"
    volumes:
      # Bucket data is kept in ./data next to this file.
      - ./data:/data
    environment:
      # Hard-coded credentials: acceptable for a local mock only, never reuse
      # them for a real deployment.
      # The &MINIO_USER / &MINIO_PASS anchors are not referenced anywhere in
      # this file.
      MINIO_ROOT_USER: &MINIO_USER admin
      MINIO_ROOT_PASSWORD: &MINIO_PASS password
    command: server --console-address ":9001" /data
# volumes:
# minio_storage: {}

View file

@ -1,5 +1,5 @@
from pathlib import Path
from Settings import (
from core.Settings import (
RDSAP_RESPONSE,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP,

View file

@ -1,24 +1,24 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
from model_data.BaseUtility import Definitions
from pathlib import Path
from model_data.simulation_system.Settings import (
from core.Settings import (
MANDATORY_FIXED_FEATURES,
AVERAGE_FIXED_FEATURES,
LATEST_FIELD,
COMPONENT_FEATURES,
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP
COLUMNS_TO_MERGE_ON
)
from DataProcessor import DataProcessor
from core.DataProcessor import DataProcessor
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
# TODO: Have a look at temporal features
def app():
# Get all the files in the directory
@ -85,9 +85,6 @@ def app():
# Take the more recent value since it's likely to be more accurate
vals = [vals[-1]]
if len(vals) == 0:
wrong_var
fixed_data[field] = np.mean(vals)
# Combine all fields together

View file

@ -0,0 +1,134 @@
"""
Script to load MLModel class and generate predictions
"""
import json
import argparse
from MLModel.Models import AutogluonModel
from core.Logger import logger
from core.DataLoader import DataLoader
from pathlib import Path
import pandas as pd
from typing import Optional
from datetime import datetime
from core.Settings import (
BASE_REGISTRY_PATH,
REGISTRY_FILE,
PREDICTION_LOCATION,
PREDICTION_FILE,
METADATA_FILE
)
# Evaluated once when the module is imported, so every output written during a
# single run shares the same timestamp folder name.
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
# FOR TESTING
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
# DATA = TEST_DATA.sample(1)
def ingest_arguments(argv: Optional[list] = None) -> argparse.Namespace:
    """
    Parse the command-line arguments of the prediction script.

    Args:
        argv: argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv``, which is what the script entry point relies on.

    Returns:
        Namespace with ``target_column``, ``model_path``, ``data`` and
        ``data_path``.
    """
    parser = argparse.ArgumentParser(description='Inputs for prediction script')
    parser.add_argument(
        '--target-column',
        type=str,
        help='The response variable you are predicting for',
        choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'],
        default='RDSAP_CHANGE'
    )
    parser.add_argument(
        '--model-path',
        type=str,
        help='If you wish to use a specific model, specify the model path here'
    )
    parser.add_argument('--data', type=str, help='Json data for predictions')
    parser.add_argument(
        '--data-path',
        type=str,
        help='Location of the dataset (.parquet or .csv) to load for prediction'
    )
    return parser.parse_args(argv)
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
    """
    Load a model, predict for the supplied data and write the outputs.

    Args:
        target_column: response variable; selects the registry and the output
            folder.
        model_path: explicit model location. When given, the registry is not
            consulted at all.
        data: input rows as a JSON string, a dict, or a ready DataFrame.
            Takes precedence over ``data_path``.
        data_path: file to load when ``data`` is not given. A single row is
            sampled from it (see TODO below).

    Returns:
        The predictions serialised as a JSON string. The same string is
        written to PREDICTION_FILE under
        PREDICTION_LOCATION / target_column / <first index value> / TIMESTAMP,
        next to a METADATA_FILE describing the model.

    Raises:
        SystemExit: if no model can be resolved or no data is supplied.
    """
    if model_path is not None:
        logger.info("User specified a model to load - ignoring registry")
        model_location = model_path
        model_type = model_path
        model_name = model_path
    else:
        # The registry is only needed on this branch, so only require it here.
        registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
        if not registry_path.exists():
            logger.error("No registry path provided or registry doesn't exist")
            raise SystemExit(1)

        # TODO: Think about where registry will sit/ type
        logger.info("Loading best model from registry")
        registry_df = pd.read_csv(registry_path)
        best_model_df = registry_df[registry_df['best_model']]
        if best_model_df.empty:
            logger.error("Registry does not flag any model as best_model")
            raise SystemExit(1)
        model_location = best_model_df['model_location'].values[0]
        model_type = best_model_df['model_type'].values[0]
        model_name = best_model_df['model_name'].values[0]

    logger.info("--- Model Info: ---")
    logger.info(f"Model type: {model_type}")
    logger.info(f"Model name: {model_name}")
    logger.info(f"Model location: {model_location}")

    logger.info("--- Loading Data ---")
    if data is None and data_path is None:
        logger.error("No Data/Data Path passed")
        raise SystemExit(1)

    if data is None:
        logger.info("Loading data from provided path")
        data = DataLoader().load(filepath=data_path, index_col="UPRN")
        # TODO: DOWNSAMPLING DOWN TO JUST USE ONE FOR PREDICTION
        data = data.sample(1)
    else:
        logger.info('Using data provided')
        if not isinstance(data, pd.DataFrame):
            # The CLI hands over a JSON string; a dict is accepted as well.
            if isinstance(data, str):
                data = json.loads(data)
            data = pd.DataFrame([data])
    logger.debug("Input data:\n%s", data)

    logger.info("--- Loading Model ---")
    model = AutogluonModel()
    model.load_model(filepath=model_location)

    logger.info("--- Generating Predictions ---")
    predictions = model.generate_predictions(data=data)

    # TODO: Check how we want to structure outputs
    # For now, just categorise by uprn and timestamp
    # Assume one uprn coming in for now
    uprn = data.index.values[0]

    # Saving prediction local for now
    # TODO: change uprn to TARGET_ID, put in setting
    logger.info("--- Outputting prediction and metadata --- ")
    # The index value is not guaranteed to be a string (e.g. an integer UPRN
    # or the default 0 for JSON input), and Path only joins with strings.
    output_base = PREDICTION_LOCATION / target_column / str(uprn) / TIMESTAMP
    output_base.mkdir(parents=True, exist_ok=True)

    # to_json(path) returns None, so serialise first and write the string
    # ourselves to be able to return it as well.
    json_prediction = predictions.to_json()
    (output_base / PREDICTION_FILE).write_text(json_prediction, encoding="utf-8")

    # TODO: change model.model.info to a class method for MLModel
    prediction_metadata = {
        "model_type": model_type,
        "model_name": model_name,
        "model_location": model_location,
        "model_settings": model.model.info()
    }
    pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE)

    return json_prediction
if __name__ == "__main__":
    # Script entry point: parse the command line and forward every option to
    # the prediction pipeline.
    # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
    # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
    cli_args = ingest_arguments()
    prediction(
        target_column=cli_args.target_column,
        model_path=cli_args.model_path,
        data=cli_args.data,
        data_path=cli_args.data_path,
    )

View file

@ -0,0 +1,208 @@
absl-py==1.4.0
accelerate==0.16.0
aiohttp==3.8.5
aiohttp-cors==0.7.0
aiosignal==1.3.1
aliyun-python-sdk-core==2.13.36
aliyun-python-sdk-kms==2.16.1
antlr4-python3-runtime==4.9.3
asttokens==2.2.1
async-timeout==4.0.3
attrs==23.1.0
autogluon==0.8.2
autogluon.common==0.8.2
autogluon.core==0.8.2
autogluon.features==0.8.2
autogluon.multimodal==0.8.2
autogluon.tabular==0.8.2
autogluon.timeseries==0.8.2
backcall==0.2.0
beautifulsoup4==4.12.2
blessed==1.20.0
blis==0.7.10
boto3==1.28.25
botocore==1.31.25
cachetools==5.3.1
catalogue==2.0.9
catboost==1.2
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.6
cloudpickle==2.2.1
colorama==0.4.6
colorful==0.5.5
comm==0.1.4
confection==0.1.1
contourpy==1.1.0
crcmod==1.7
cryptography==41.0.3
cycler==0.11.0
cymem==2.0.7
datasets==2.14.4
debugpy==1.6.7
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.7
distlib==0.3.7
evaluate==0.3.0
executing==1.2.0
fastai==2.7.12
fastcore==1.5.29
fastdownload==0.0.7
fastprogress==1.0.3
filelock==3.12.2
fonttools==4.42.0
frozenlist==1.4.0
fsspec==2023.6.0
future==0.18.3
gdown==4.7.1
gluonts==0.13.3
google-api-core==2.11.1
google-auth==2.22.0
google-auth-oauthlib==1.0.0
googleapis-common-protos==1.60.0
gpustat==1.1
graphviz==0.20.1
grpcio==1.50.0
huggingface-hub==0.16.4
hyperopt==0.2.7
idna==3.4
imageio==2.31.1
ipykernel==6.25.1
ipython==8.14.0
jedi==0.19.0
Jinja2==3.1.2
jmespath==0.10.0
joblib==1.3.2
jsonschema==4.17.3
jupyter_client==8.3.0
jupyter_core==5.3.1
kiwisolver==1.4.4
langcodes==3.3.0
lightgbm==3.3.5
lightning-utilities==0.9.0
llvmlite==0.40.1
Markdown==3.4.4
markdown-it-py==3.0.0
MarkupSafe==2.1.3
matplotlib==3.7.2
matplotlib-inline==0.1.6
mdurl==0.1.2
mlforecast==0.7.3
model-index==0.1.11
msgpack==1.0.5
multidict==6.0.4
multiprocess==0.70.15
murmurhash==1.0.9
nest-asyncio==1.5.7
networkx==3.1
nlpaug==1.1.11
nltk==3.8.1
nptyping==2.4.1
numba==0.57.1
numpy==1.24.4
nvidia-ml-py==12.535.77
oauthlib==3.2.2
omegaconf==2.2.3
opencensus==0.11.2
opencensus-context==0.1.3
opendatalab==0.0.10
openmim==0.3.9
openxlab==0.0.17
ordered-set==4.1.0
oss2==2.17.0
packaging==23.1
pandas==1.5.3
parso==0.8.3
pathy==0.10.2
patsy==0.5.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.5.0
platformdirs==3.10.0
plotly==5.16.0
preshed==3.0.8
prometheus-client==0.17.1
prompt-toolkit==3.0.39
protobuf==3.20.2
psutil==5.9.5
ptyprocess==0.7.0
pure-eval==0.2.2
py-spy==0.3.14
py4j==0.10.9.7
pyarrow==12.0.1
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pycryptodome==3.18.0
pydantic==1.10.12
Pygments==2.16.1
pyparsing==3.0.9
pyrsistent==0.19.3
PySocks==1.7.1
pytesseract==0.3.10
python-dateutil==2.8.2
pytorch-lightning==1.9.5
pytorch-metric-learning==1.7.3
pytz==2023.3
PyWavelets==1.4.1
PyYAML==6.0.1
pyzmq==25.1.0
ray==2.3.1
regex==2023.8.8
requests==2.28.2
requests-oauthlib==1.3.1
responses==0.18.0
rich==13.4.2
rsa==4.9
s3transfer==0.6.1
safetensors==0.3.2
scikit-image==0.19.3
scikit-learn==1.2.2
scipy==1.11.1
seaborn==0.12.2
sentencepiece==0.1.99
seqeval==1.2.2
six==1.16.0
smart-open==6.3.0
soupsieve==2.4.1
spacy==3.6.1
spacy-legacy==3.0.12
spacy-loggers==1.0.4
srsly==2.4.7
stack-data==0.6.2
statsforecast==1.4.0
statsmodels==0.14.0
tabulate==0.9.0
tenacity==8.2.2
tensorboard==2.14.0
tensorboard-data-server==0.7.1
tensorboardX==2.6.2
text-unidecode==1.3
thinc==8.1.12
threadpoolctl==3.2.0
tifffile==2023.7.18
timm==0.9.5
tokenizers==0.13.3
toolz==0.12.0
torch==1.13.1
torchmetrics==0.11.4
torchvision==0.14.1
tornado==6.3.2
tqdm==4.65.1
traitlets==5.9.0
transformers==4.26.1
typer==0.9.0
typing_extensions==4.7.1
tzdata==2023.3
ujson==5.8.0
urllib3==1.26.16
virtualenv==20.24.3
wasabi==1.1.2
wcwidth==0.2.6
Werkzeug==2.3.6
window-ops==0.0.14
xgboost==1.7.6
xxhash==3.3.0
yarl==1.9.2

View file

@ -1,9 +1,12 @@
from Logger import logger
from core.Logger import logger
import argparse
import pandas as pd
from pathlib import Path
RANDOM_SEED = 0
from core.Settings import (
RANDOM_SEED,
TRAIN_AND_VALIDATION_DATA_NAME,
TEST_DATA_NAME
)
def ingest_arguments() -> argparse.Namespace:
"""
@ -56,8 +59,8 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
logger.info('--- Saving data ---')
train_validation_data.to_parquet(Path(output_folder)/'train_validation_data.parquet')
test_data.to_parquet(Path(output_folder)/'test_data.parquet')
train_validation_data.to_parquet(Path(output_folder)/ TRAIN_AND_VALIDATION_DATA_NAME)
test_data.to_parquet(Path(output_folder)/ TEST_DATA_NAME)
logger.info(' ---Pipeline complete---')

View file

@ -1,19 +1,49 @@
import os
import pandas as pd
import argparse
# import boto3
import os
from pathlib import Path
from datetime import datetime
from typing import List
from Logger import logger
from autogluon.tabular import TabularDataset, TabularPredictor
from core.Logger import logger
from core.DataLoader import DataLoader
from core.FeatureProcessor import FeatureProcessor
from MLModel.Models import AutogluonModel
import pandas as pd
from core.Settings import (
MODEL_DIRECTORY,
BASE_REGISTRY_PATH,
REGISTRY_FILE,
MODEL_FOLDER,
METRICS_FOLDER,
DEPLOYMENT_FOLDER,
SUBSAMPLE_FACTOR,
MODEL_HYPERPARAMETERS
)
import seaborn as sns
import matplotlib.pyplot as plt
DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
FEATURE_COLUMNS = None
RANDOM_SEED = 0
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
# FOR TESTING
train_filepath = "./model_build_data/train_validation_data.parquet"
test_filepath = "./model_build_data/test_data.parquet"
# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
# target_column = "RDSAP_CHANGE"
# model_type = "autogluon"
# hyperparameter = HYPERPARAMETERS
# SUBSAMPLE_FACTOR = 200
# SESSION = boto3.Session()
# S3_CLIENT = SESSION.client(
# service_name="s3",
# aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", 'admin'),
# aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", 'password'),
# endpoint_url=os.environ.get("ENDPOINT_URL", "http://localhost:9000")
# )
# S3_CLIENT.create_bucket
# S3_CLIENT.list_buckets()
def ingest_arguments() -> argparse.Namespace:
"""
@ -22,116 +52,148 @@ def ingest_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Inputs for training script')
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
args = parser.parse_args()
return args
class DataLoader():
@staticmethod
def load(filepath: str) -> pd.DataFrame:
"""
Load different datasets
"""
if filepath.endswith('.parquet'):
df = pd.read_parquet(filepath)
elif filepath.endswith('.csv.'):
df = pd.read_csv(filepath)
else:
logger.error('Not implemented!')
exit(1)
return df
class FeatureProcessor:
"""
Handle all feature manipulation before modelling
"""
@staticmethod
def drop_columns(df: pd.DataFrame, drop_columns: str = DROP_COLUMNS) -> pd.DataFrame:
df = df.drop(columns=[drop_columns])
return df
def retain_features(df: pd.DataFrame, features: List[str] = None):
"""
Determine which columns to keep ofr modelling
"""
if features is None:
features = df.columns
else:
if not set(features).issubset(df.columns):
logger.error('Features defined is not contained in data')
exit(1)
df = df[features]
return df
def process(self, df: pd.DataFrame) -> pd.DataFrame:
df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
df = self.retain_features(df, features=FEATURE_COLUMNS)
return df
def training(train_filepath: str, test_filepath: str) -> None:
def training(
train_filepath: str,
test_filepath: str,
target_column: str = "RDSAP_CHANGE",
model_type: str = "autogluon",
hyperparameters: dict = None
) -> None:
"""
Pipeline to run training on the dataset
"""
logger.info('Loading data')
logger.info('--- Loading data ---')
dataloader = DataLoader()
train_df = dataloader.load(filepath=train_filepath)
test_df = dataloader.load(filepath=test_filepath)
# df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE'])
logger.info('Feature processing')
logger.info('--- Feature processing ---')
feature_processor = FeatureProcessor()
train_df = feature_processor.process(train_df)
test_df = feature_processor.process(test_df)
# logger.info('Split data into train and validation')
subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR)
logger.info('Build Model')
train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
test_df = feature_processor.process(test_df, target_column=target_column)
logger.info('--- Build Model ---')
logger.info("--- Load Hyperparameters ---")
if hyperparameters is None:
logger.info("Use base hyperparameters in settings")
hyperparameters = MODEL_HYPERPARAMETERS[model_type]
logger.info(f'Hyperparameters are: {hyperparameters}')
if model_type == "autogluon":
model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
model = AutogluonModel(
output_filepath = output_base / MODEL_FOLDER
)
else:
logger.error("No alternative model implemented yet")
exit(1)
data = TabularDataset(data=train_filepath)
data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE'])
TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT']
# top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))]
model.train_model(
data=train_df,
target_column=target_column,
hyperparameters=hyperparameters
)
logger.info("--- Save Model ---")
model.save_model(output_filepath=model.output_filepath)
data = data[['RDSAP_CHANGE'] + top_features.to_list()]
# data = TabularDataset(data=train_df)
# data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
subsample_size = round(len(data)/20)
data = data.sample(subsample_size, random_state=RANDOM_SEED)
logger.info('--- Generate evaluation metrics ---')
metrics_df = model.model_evaluation(
validation_data=test_df,
target_column=target_column,
metrics_location = output_base / METRICS_FOLDER
)
logger.info("--- Generate metric outputs using predictions ---")
# TODO: can have a model.metric_outputs method
# For now just do it here
residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred'])
# image formatting
# TODO: move to settings file , AXIS_FONT, TITLE_FONT
axis_fs = 18 #fontsize
title_fs = 22 #fontsize
sns.set(style="whitegrid")
ax = sns.scatterplot(x="true", y="pred",data=residual_df)
ax.set_aspect('equal')
ax.set_xlabel(f'True {target_column}',fontsize = axis_fs)
ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel
ax.set_title('Residuals', fontsize = title_fs)
# Add custom metric class MAPE
# Have a look at temporal features
# Square aspect ratio
ax.plot([-100, 100], [-100, 100], 'black', linewidth=1)
target_column = 'RDSAP_CHANGE'
predictor_RDSAP = TabularPredictor(
label=target_column,
path="agModels-predictRDSAP",
problem_type="regression",
eval_metric='mean_absolute_error'
).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN'])
plt.tight_layout()
RESIDUAL_FILE = "residuals.png"
plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
# TODO: for cml, we might want to have class that outputs all data and plots to add to the report
# If we want residual plot/ any plots, we will need to self host
# plt.savefig(RESIDUAL_FILE, dpi=120)
# TODO: introduce a seperate script for model optimisation, and from there, optimise for deployment
# Imagining for now that the model trained here is the best model amongst all models built
logger.info('Evaluate matrics')
logger.info("--- Optimising model for deployment ---")
test_data = TabularDataset('./model_build_data/test_data.parquet')
performance = predictor_RDSAP.evaluate(test_data)
predictions = predictor_RDSAP.predict(test_data)
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER)
logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
# TODO: Need a model registry - for now have this as a CSV
# Save this in the model directory
logger.info("--- Append registry with new model ---")
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
if registry_path.exists():
logger.info("Registry file found - Loading into Dataframe")
registry_df = pd.read_csv(registry_path, index_col=None)
else:
# TODO: Moved columns into settings: MODEL_DETAILS and Metrics class columns
registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
model_details_df = pd.DataFrame(
[{
'model_type': model_type,
'model_name': model_root,
'model_location': deployment_model_path
}]
)
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
# TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics
# TODO: decide metric to optimise to
registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
registry_df['best_model'] = [False]*len(registry_df)
registry_df.loc[0, 'best_model'] = True
logger.info("--- Saving new model to registry ---")
registry_df.to_csv(registry_path, index=False)
logger.info("--- Training Pipeline Complete --- ")
test_data['predictions'] = predictions
test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions'])
if __name__ == "__main__":
@ -140,4 +202,11 @@ if __name__ == "__main__":
logger.info('---Ingest Arguments---')
args = ingest_arguments()
training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)
# To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
training(
train_filepath=args.train_filepath,
test_filepath=args.test_filepath,
target_column=args.target_column,
model_type=args.model_type
)