Merge in changes; keep imports within core to reduce the dependency footprint of the Dockerfile

This commit is contained in:
Michael Duong 2023-08-30 10:16:42 +01:00
commit 0f2cd9b09d
10 changed files with 53 additions and 43 deletions

4
.gitignore vendored
View file

@ -127,6 +127,7 @@ venv/
ENV/ ENV/
env.bak/ env.bak/
venv.bak/ venv.bak/
.training_env/
# Spyder project settings # Spyder project settings
.spyderproject .spyderproject
@ -252,6 +253,7 @@ backend/.idea
open_uprn/.idea/ open_uprn/.idea/
conservation_areas/.idea/ conservation_areas/.idea/
model_data/.idea/ model_data/.idea/
model_data/simulation_system/.idea/
model_data/simulation_system/data* model_data/simulation_system/data*
model_data/simulation_system/model_directory/

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" /> <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" /> <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content> </content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="Python 3.10 (simulation_system_prediction)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
</module> </module>

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (simulation_system_prediction)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser"> <component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" /> <option name="version" value="3" />
</component> </component>

0
__init__.py Normal file
View file

View file

@ -129,17 +129,15 @@ class AutogluonModel:
return metrics_df return metrics_df
def optimise_model_for_deployment(self, deployment_path: Path = None) -> None: def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
""" """
We can optimise the deployment for an autogluon model We can optimise the deployment for an autogluon model
""" """
if self.model is None: if self.model is None:
logger.error("No model to optimise for deployment") raise ValueError("No model to optimise for deployment")
exit(1)
if deployment_path is None: if deployment_path is None:
logger.error("Deployment path required") raise ValueError("Deployment path required")
exit(1)
# This will return a string path of the location # This will return a string path of the location
return self.model.clone_for_deployment(deployment_path) return self.model.clone_for_deployment(deployment_path)

View file

@ -1,13 +1,18 @@
import pandas as pd import pandas as pd
from core.Logger import logger import os
class DataLoader():
class DataLoader:
@staticmethod @staticmethod
def load(filepath: str, index_col: str = None) -> pd.DataFrame: def load(filepath: str, index_col: str = None) -> pd.DataFrame:
""" """
Load different datasets Load different datasets
""" """
if not os.path.exists(filepath):
raise FileNotFoundError(f"File not found: {filepath}")
if filepath.endswith('.parquet'): if filepath.endswith('.parquet'):
df = pd.read_parquet(filepath) df = pd.read_parquet(filepath)
if index_col is not None: if index_col is not None:
@ -15,7 +20,6 @@ class DataLoader():
elif filepath.endswith('.csv'): elif filepath.endswith('.csv'):
df = pd.read_csv(filepath, index_col=index_col) df = pd.read_csv(filepath, index_col=index_col)
else: else:
logger.error('Not implemented!') raise ValueError(f"File format not supported for file: {filepath}")
exit(1)
return df return df

View file

@ -23,6 +23,7 @@ class DataProcessor:
def __init__(self, filepath: Path) -> None: def __init__(self, filepath: Path) -> None:
self.filepath = filepath self.filepath = filepath
self.data = None
def load_data(self, low_memory=False) -> None: def load_data(self, low_memory=False) -> None:
self.data = pd.read_csv(self.filepath, low_memory=low_memory) self.data = pd.read_csv(self.filepath, low_memory=low_memory)

View file

@ -6,18 +6,21 @@ import pandas as pd
from typing import List from typing import List
from core.Logger import logger from core.Logger import logger
RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE'] RDSAP_CHANGE_DROP_COLUMNS = ["UPRN", "HEAT_DEMAND_CHANGE"]
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE'] HEAT_DEMAND_CHANGE_DROP_COLUMNS = ["UPRN", "RDSAP_CHANGE"]
RANDOM_SEED = 0
RANDOM_SEED = 0
class FeatureProcessor: class FeatureProcessor:
""" """
Handle all feature manipulation before modelling Handle all feature manipulation before modelling
""" """
@staticmethod @staticmethod
def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame: def drop_unused_columns(
df: pd.DataFrame, target_column: str = "RDSAP_CHANGE"
) -> pd.DataFrame:
""" """
Remove the unused columns for RDS Remove the unused columns for RDS
""" """
@ -36,13 +39,13 @@ class FeatureProcessor:
features = df.columns features = df.columns
else: else:
if not set(features).issubset(df.columns): if not set(features).issubset(df.columns):
logger.error('Features defined is not contained in data') logger.error("Features defined is not contained in data")
exit(1) exit(1)
df = df[features] df = df[features]
return df return df
@staticmethod @staticmethod
def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame: def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
""" """
@ -53,14 +56,13 @@ class FeatureProcessor:
df = df.sample(subsample_amount, random_state=RANDOM_SEED) df = df.sample(subsample_amount, random_state=RANDOM_SEED)
return df return df
def process( def process(
self, self,
df: pd.DataFrame, df: pd.DataFrame,
target_column: str = "RDSAP_CHANGE", target_column: str = "RDSAP_CHANGE",
features: List[str] = None, features: List[str] = None,
subsample_amount: int = None subsample_amount: int = None,
) -> pd.DataFrame: ) -> pd.DataFrame:
""" """
Pipeline to get data ready for building a model Pipeline to get data ready for building a model
""" """

View file

@ -4,13 +4,12 @@ Script to load MLModel class and generate predictions
import json import json
import argparse import argparse
from MLModel.Models import AutogluonModel
from core.Logger import logger
from core.DataLoader import DataLoader
from pathlib import Path
import pandas as pd import pandas as pd
from typing import Optional from typing import Optional
from datetime import datetime from datetime import datetime
from MLModel.Models import AutogluonModel
from core.Logger import logger
from core.DataLoader import DataLoader
from core.Settings import ( from core.Settings import (
BASE_REGISTRY_PATH, BASE_REGISTRY_PATH,
REGISTRY_FILE, REGISTRY_FILE,
@ -23,7 +22,8 @@ from core.Settings import (
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT) TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)
# FOR TESTING # FOR TESTING
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame) # For now just loading data first and then passing into function (i.e. as if we receive json data and convert to
# DataFrame)
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet") # TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
# DATA = TEST_DATA.sample(1) # DATA = TEST_DATA.sample(1)
@ -110,6 +110,7 @@ def prediction(
logger.info("--- Loading Model ---") logger.info("--- Loading Model ---")
model = AutogluonModel() model = AutogluonModel()
model.load_model(filepath=model_location) model.load_model(filepath=model_location)
logger.info("--- Generating Predictions ---") logger.info("--- Generating Predictions ---")
@ -143,7 +144,6 @@ def prediction(
if __name__ == "__main__": if __name__ == "__main__":
args = ingest_arguments() args = ingest_arguments()
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}' # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'

View file

@ -3,11 +3,13 @@ import argparse
# import boto3 # import boto3
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from MLModel.Models import AutogluonModel
from core.Logger import logger from core.Logger import logger
from core.DataLoader import DataLoader from core.DataLoader import DataLoader
from core.FeatureProcessor import FeatureProcessor from core.FeatureProcessor import FeatureProcessor
from MLModel.Models import AutogluonModel
import pandas as pd
from core.Settings import ( from core.Settings import (
MODEL_DIRECTORY, MODEL_DIRECTORY,
BASE_REGISTRY_PATH, BASE_REGISTRY_PATH,
@ -30,8 +32,6 @@ from core.Settings import (
SEABORN_RESIDUAL_LINE_COLOUR, SEABORN_RESIDUAL_LINE_COLOUR,
SEABORN_RESIDUAL_LINE_WIDTH, SEABORN_RESIDUAL_LINE_WIDTH,
) )
import seaborn as sns
import matplotlib.pyplot as plt
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT) TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)
@ -137,8 +137,7 @@ def training(
model = AutogluonModel(output_filepath=output_base / MODEL_FOLDER) model = AutogluonModel(output_filepath=output_base / MODEL_FOLDER)
else: else:
logger.error("No alternative model implemented yet") raise ValueError("No alternative model implemented yet")
exit(1)
model.train_model( model.train_model(
data=train_df, target_column=target_column, hyperparameters=hyperparameters data=train_df, target_column=target_column, hyperparameters=hyperparameters
@ -207,7 +206,6 @@ def training(
# TODO: Need a model registry - for now have this as a CSV # TODO: Need a model registry - for now have this as a CSV
# Save this in the model directory # Save this in the model directory
logger.info("--- Append registry with new model ---") logger.info("--- Append registry with new model ---")
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
if registry_path.exists(): if registry_path.exists():
@ -244,7 +242,8 @@ def training(
registry_row = pd.concat([model_details_df, metrics_df], axis=1) registry_row = pd.concat([model_details_df, metrics_df], axis=1)
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True) registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
# TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and
# regenerate new metrics
# TODO: decide metric to optimise to # TODO: decide metric to optimise to
registry_df = registry_df.sort_values( registry_df = registry_df.sort_values(
"mean_absolute_error", ascending=False "mean_absolute_error", ascending=False
@ -253,6 +252,8 @@ def training(
registry_df.loc[0, "best_model"] = True registry_df.loc[0, "best_model"] = True
logger.info("--- Saving new model to registry ---") logger.info("--- Saving new model to registry ---")
# Ensure the directory exists
registry_path.parent.mkdir(parents=True, exist_ok=True)
registry_df.to_csv(registry_path, index=False) registry_df.to_csv(registry_path, index=False)
logger.info("--- Training Pipeline Complete --- ") logger.info("--- Training Pipeline Complete --- ")
@ -265,7 +266,9 @@ if __name__ == "__main__":
logger.info("---Ingest Arguments---") logger.info("---Ingest Arguments---")
args = ingest_arguments() args = ingest_arguments()
# To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet # To run script: python3 training.py --train-filepath
# ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath
# ./model_build_data/change_data/rdsap_full/test_data.parquet
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script # TODO: Ingest hyper parameters from somewhere - currently change at the top of script
training( training(
train_filepath=args.train_filepath, train_filepath=args.train_filepath,