diff --git a/backend/Property.py b/backend/Property.py index 2358cbc2..0efcf545 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -127,7 +127,7 @@ class Property(Definitions): """ ventilation = self.data["mechanical-ventilation"] - # perform some simple cleaning - when checking 300k properties, the only unique values were + # perform some simple cleaning - when checking 300k property_change, the only unique values were # {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'} if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]: ventilation = None @@ -145,7 +145,7 @@ class Property(Definitions): - solar_pv This is based on the "photo-supply" field in the EPC data. - When checking 100k properties, either the value was "" or a stringified number + When checking 100k property_change, either the value was "" or a stringified number """ solar_pv = self.data["photo-supply"] diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index a8ad04a7..b2170eff 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -253,7 +253,7 @@ async def trigger_plan(body: PlanTriggerRequest): materials_by_type = filter_materials(materials) cleaned = get_cleaned() - logger.info("Getting components and properties recommendations") + logger.info("Getting components and property_change recommendations") # TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers # in as a dependency and then the optimisers can take the input measures in as part of the setup() method diff --git a/model_data/EpcClean.py b/etl/epc_clean/EpcClean.py similarity index 100% rename from model_data/EpcClean.py rename to etl/epc_clean/EpcClean.py diff --git a/model_data/analysis/__init__.py b/etl/epc_clean/__init__.py similarity index 100% rename from model_data/analysis/__init__.py rename to etl/epc_clean/__init__.py diff --git a/model_data/cleaner_app.py b/etl/epc_clean/app.py similarity index 99% rename from model_data/cleaner_app.py rename to etl/epc_clean/app.py index 40011404..4a51ed58 100644 --- a/model_data/cleaner_app.py +++ b/etl/epc_clean/app.py @@ -27,7 +27,7 @@ ENVIRONMENT = os.getenv("ENVIRONMENT", "dev") def app(): """ For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API - and produce a dataset of cleaned fields so that when we get new properties, we can quickly + and produce a dataset of cleaned fields so that when we get new property_change, we can quickly sanitise any description data Currently, this application is just run on a local machine diff --git a/model_data/simulation_system/core/DataProcessor.py b/etl/property_change/DataProcessor.py similarity index 79% rename from model_data/simulation_system/core/DataProcessor.py rename to etl/property_change/DataProcessor.py index 439ce2e0..e05ae8dc 100644 --- a/model_data/simulation_system/core/DataProcessor.py +++ b/etl/property_change/DataProcessor.py @@ -16,7 +16,6 @@ from model_data.simulation_system.core.Settings import ( RDSAP_RESPONSE, MAX_SAP_SCORE, fill_na_map, - FIXED_DESCRIPTON_MAPPED_FEATURES ) from typing import List @@ -348,7 +347,7 @@ class DataProcessor: cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE") - # If there still is na values, use average across all properties in consituecy + # If there still is na values, use average across all property_change in consituecy cleaning_averages_filled[variable] = cleaning_averages_filled[ variable ].fillna(cleaning_averages_filled[variable].mean()) @@ -528,126 +527,3 @@ class DataProcessor: df[column] = df[column].astype(bool) return df - - @classmethod - def difference_data(cls, df: pd.DataFrame): - - """ - Given a dataframe and starting and ending columns, this function will convert the features to - differenced the ending subtract the starting value, which is useful for modelling the difference responces - """ - - # We ensure that the u value columns are co-erced to a numerical format - uvalue_columns = [col for col in df.columns if "thermal_transmittance" in col] - for uvalue_col in uvalue_columns: - df[uvalue_col] = pd.to_numeric(df[uvalue_col]) - - key_columns = [ - "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", - "SAP_STARTING", "HEAT_DEMAND_STARTING", - "CARBON_STARTING", "UPRN", "CONSTITUENCY", - "SAP_ENDING", "CARBON_ENDING", "HEAT_DEMAND_ENDING", - "DAYS_TO_STARTING", "DAYS_TO_ENDING" - ] - - ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns - - columns = {x for x in df.columns if x not in ignore_cols} - - non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist() - non_numerical_columns = [col for col in non_numerical_columns if col in columns] - levels = {col: df[col].unique().tolist() for col in non_numerical_columns} - - df = pd.get_dummies(df, columns=non_numerical_columns) - - # We make sure there is a starting and ending version of the column - diff_columns = [] - no_diff_columns = [] # Store for debugging - for col in columns: - if "_ENDING" in col: - # Don't keep the endings - continue - else: - # We have a starting column so check if we have an ending - if col.replace("_STARTING", "") + "_ENDING" in columns: - diff_columns.append(col) - else: - no_diff_columns.append(col) - - if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns): - raise Exception("Something went wrong, potentially missed a differencing column") - - datatypes = df.dtypes - - # Note: We also difference columns like floor area and floor height. We should experiement with this. - # Starting floor area will heavily impact the starting sap value so that feature may be encapsulated by - # the starting value, therefore to explain any differences in the new floor area, it may be enough to - # just consider the difference however we can play around with this. - - # Do the differencing - cols_to_append = {} - for starting_col in diff_columns: - - base_col = starting_col.replace("_STARTING", "") - - if "_STARTING" in starting_col: - ending_col = starting_col.replace("_STARTING", "_ENDING") - else: - ending_col = starting_col + "_ENDING" - - if starting_col not in non_numerical_columns: - cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col] - df = df.drop(columns=[starting_col, ending_col]) - continue - - level_values = list(set(levels[starting_col] + levels[ending_col])) - - level_cols = [] - for level in level_values: - starting_level_col = "_".join([starting_col, str(level)]) - ending_level_col = "_".join([ending_col, str(level)]) - - if starting_level_col not in df.columns: - # We have no starting, just ending - col_type = datatypes[ending_level_col].name - - if col_type == "bool": - cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col].astype(int) - else: - cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col] - - level_cols.append(ending_level_col) - - elif ending_level_col not in df.columns: - # We have no ending, just starting - col_type = datatypes[starting_level_col].name - - if col_type == "bool": - cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[starting_level_col].astype(int) - else: - cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[ending_level_col] - - level_cols.append(starting_level_col) - - else: - col_type = datatypes[starting_level_col].name - - if col_type == "bool": - cols_to_append[f"{base_col}_{level}_DIFF"] = ( - df[ending_level_col].astype(int) - df[starting_level_col].astype(int) - ) - else: - cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col] - df[starting_level_col] - - level_cols.extend([starting_level_col, ending_level_col]) - - # Drop the columns - df = df.drop(columns=level_cols) - - cols_to_append = pd.DataFrame(cols_to_append) - df = pd.concat([df, cols_to_append], axis=1) - - # Perform a final coercing of string True/False columns to boolean - df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns) - - return df diff --git a/model_data/simulation_system/core/FeatureProcessor.py b/etl/property_change/FeatureProcessor.py similarity index 100% rename from model_data/simulation_system/core/FeatureProcessor.py rename to etl/property_change/FeatureProcessor.py diff --git a/model_data/plotting/__init__.py b/etl/property_change/__init__.py similarity index 100% rename from model_data/plotting/__init__.py rename to etl/property_change/__init__.py diff --git a/model_data/simulation_system/generate_rdsap_change.py b/etl/property_change/app.py similarity index 99% rename from model_data/simulation_system/generate_rdsap_change.py rename to etl/property_change/app.py index e3d3bd0e..570de202 100644 --- a/model_data/simulation_system/generate_rdsap_change.py +++ b/etl/property_change/app.py @@ -4,7 +4,7 @@ from tqdm import tqdm import msgpack from pathlib import Path -from model_data.simulation_system.core.Settings import ( +from etl.property_change.settings import ( MANDATORY_FIXED_FEATURES, LATEST_FIELD, COMPONENT_FEATURES, @@ -14,8 +14,8 @@ from model_data.simulation_system.core.Settings import ( EARLIEST_EPC_DATE, CARBON_RESPONSE, ) -from model_data.simulation_system.core.DataProcessor import DataProcessor -from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_from_s3_parquet +from etl.property_change.DataProcessor import DataProcessor +from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3 from recommendations.rdsap_tables import england_wales_age_band_lookup from recommendations.recommendation_utils import ( get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter, diff --git a/etl/properties/requirements.txt b/etl/property_change/requirements.txt similarity index 100% rename from etl/properties/requirements.txt rename to etl/property_change/requirements.txt diff --git a/etl/property_change/settings.py b/etl/property_change/settings.py new file mode 100644 index 00000000..e64ba210 --- /dev/null +++ b/etl/property_change/settings.py @@ -0,0 +1,213 @@ +# Using a simply python file as settings for now +# TODO: migrate to dynaconf +from pathlib import Path + +METRIC_FILENAME = "metrics.csv" + +OPTIMISE_METRIC = "mean_absolute_error" +BEST_MODEL_COLUMN_NAME = "best_model" + +# TODO: remove these setting elsewhere for CML +RESIDUAL_TRUE_LABEL = "true" +RESIDUAL_PREDICTION_LABEL = "pred" +RESIDUAL_FILE = "residual.png" +SEABORN_RESIDUAL_AXIS_FONTSIZE = 12 +SEABORN_RESIDUAL_TITLE_FONTSIZE = 22 +SEABORN_RESIDUAL_STYLE = "whitegrid" +SEABORN_RESIDUAL_ASPECT_RATIO = "equal" +SEABORN_RESIDUAL_PLOT_DPI = 120 +SEABORN_RESIDUAL_RANGE = [-100, 100] +SEABORN_RESIDUAL_LINE_COLOUR = "black" +SEABORN_RESIDUAL_LINE_WIDTH = 1 + +# Can move to a hyperparmeters file +# If anything we might want to have a file that can be loaded and sent to this script +MODEL_HYPERPARAMETERS = { + "autogluon": { + "problem_type": "regression", + "eval_metric": "mean_absolute_error", + "time_limit": 45, + "presets": "medium_quality", + "excluded_model_types": None, + } +} + +TIMESTAMP_FORMAT = "%Y_%m_%d_%H_%M_%S" + +RANDOM_SEED = 0 +SUBSAMPLE_FACTOR = 200 + +TRAIN_AND_VALIDATION_DATA_NAME = "train_validation_data.parquet" +TEST_DATA_NAME = "test_data.parquet" + +REGISTRY_FILE = "model_registry.csv" +MODEL_DIRECTORY = "model_directory" +BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY +PREDICTION_LOCATION = Path("predictions") +PREDICTION_FILE = "prediction.json" +METADATA_FILE = "metadata.json" +MODEL_FOLDER = "model" +METRICS_FOLDER = "metrics" +DEPLOYMENT_FOLDER = "deployment" + +TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70 +FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45 + +AVERAGE_FIXED_FEATURES = [ + "TOTAL_FLOOR_AREA", + "FLOOR_HEIGHT", + "FIXED_LIGHTING_OUTLETS_COUNT", +] + +COLUMNS_TO_MERGE_ON = [ + "PROPERTY_TYPE", + "BUILT_FORM", + "CONSTRUCTION_AGE_BAND", + "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS", +] + +FULLY_GLAZED_DESCRIPTIONS = [ + "Fully double glazed", + "High performance glazing", + "Fully triple glazed", + "Full secondary glazing", + "Multiple glazing throughout", +] + +FIXED_FEATURES = [ + "PROPERTY_TYPE", + "BUILT_FORM", + "CONSTRUCTION_AGE_BAND", + "NUMBER_HABITABLE_ROOMS", + "CONSTITUENCY", + "NUMBER_HEATED_ROOMS", + "FIXED_LIGHTING_OUTLETS_COUNT", +] + +COMPONENT_FEATURES = [ + "TRANSACTION_TYPE", + "WALLS_DESCRIPTION", + "FLOOR_DESCRIPTION", + "LIGHTING_DESCRIPTION", + "ROOF_DESCRIPTION", + "MAINHEAT_DESCRIPTION", + "HOTWATER_DESCRIPTION", + "MAIN_FUEL", + "MECHANICAL_VENTILATION", + "SECONDHEAT_DESCRIPTION", + "ENERGY_TARIFF", # Not sure if this is relevant + "SOLAR_WATER_HEATING_FLAG", + "PHOTO_SUPPLY", + "WINDOWS_DESCRIPTION", + "GLAZED_TYPE", + "MULTI_GLAZE_PROPORTION", + "LOW_ENERGY_LIGHTING", + "NUMBER_OPEN_FIREPLACES", + "MAINHEATCONT_DESCRIPTION", + "EXTENSION_COUNT", + "TOTAL_FLOOR_AREA", + "FLOOR_HEIGHT", + # 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION +] + +# For these fields, we take the latest value if we have multiple values +# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is +# the most accurate +LATEST_FIELD = [ + "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS", + "FIXED_LIGHTING_OUTLETS_COUNT", + "CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for +] + +# If we see thee features changing, we don't use the EPC, since deem it not to be reliable +MANDATORY_FIXED_FEATURES = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY"] + +# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were +# conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England +# and Wales from 31 July 2014 +EARLIEST_EPC_DATE = "2014-08-01" + +RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY" +HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT" +CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT" + + +def ordinal(n): + if 10 <= n % 100 <= 20: + suffix = "th" + else: + suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th") + + return str(n) + suffix + + +FLOOR_LEVEL_MAP = { + "Basement": -1, + "Ground": 0, + "ground floor": 0, + "20+": 20, + "21st or above": 21, + **{str(i).zfill(2): i for i in range(0, 21)}, + **{ordinal(i): i for i in range(-1, 21)}, + **{str(i): i for i in range(-1, 21)}, + **{i: i for i in range(-1, 21)}, +} + +BUILT_FORM_REMAP = { + "Enclosed End-Terrace": "End-Terrace", + "Enclosed Mid-Terrace": "Mid-Terrace", +} + +DATA_PROCESSOR_SETTINGS = { + "low_memory": False, + "epc_minimum_count": 1, + "column_mappings": {"UPRN": [int, str]}, +} + +# This has a manual mapping of the column types required +COLUMNTYPES = { + 'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object', + 'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64', + 'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64', + 'CONSTRUCTION_AGE_BAND': 'object', + 'TRANSACTION_TYPE': 'object', + 'WALLS_DESCRIPTION': 'object', + 'FLOOR_DESCRIPTION': 'object', + 'LIGHTING_DESCRIPTION': 'object', + 'ROOF_DESCRIPTION': 'object', + 'MAINHEAT_DESCRIPTION': 'object', + 'HOTWATER_DESCRIPTION': 'object', 'MAIN_FUEL': 'object', + 'MECHANICAL_VENTILATION': 'object', + 'SECONDHEAT_DESCRIPTION': 'object', 'ENERGY_TARIFF': 'object', + 'SOLAR_WATER_HEATING_FLAG': 'object', 'PHOTO_SUPPLY': 'float64', + 'WINDOWS_DESCRIPTION': 'object', + 'GLAZED_TYPE': 'object', + 'MULTI_GLAZE_PROPORTION': 'float64', + 'LOW_ENERGY_LIGHTING': 'float64', + 'NUMBER_OPEN_FIREPLACES': 'float64', + 'MAINHEATCONT_DESCRIPTION': 'object', + 'EXTENSION_COUNT': 'float64', + 'LODGEMENT_DATE': 'object', +} + +# For modelling, we don't allow records with more than 100 SAP points +MAX_SAP_SCORE = 100 + +fill_na_map = { + # There are some descriptions, such as "To be used only when there is no heating/hot-water system or data is from + # a community network" that could be clustered with unknown fuel + "MAIN_FUEL": "UNKNOWN", + "MECHANICAL_VENTILATION": "Unknown", + "SECONDHEAT_DESCRIPTION": "None", + "ENERGY_TARIFF": "Unknown", + # We set solar water heating flag to N - we could investigate using a different category entirely + "SOLAR_WATER_HEATING_FLAG": "N", + "GLAZED_TYPE": "not defined", + "MULTI_GLAZE_PROPORTION": 0, + "LOW_ENERGY_LIGHTING": 0, + "MAINHEATCONT_DESCRIPTION": "Unknown", + "EXTENSION_COUNT": 0, + "NUMBER_OPEN_FIREPLACES": 0 +} diff --git a/model_data/BoreholeClient.py b/etl/spatial/BoreholeClient.py similarity index 98% rename from model_data/BoreholeClient.py rename to etl/spatial/BoreholeClient.py index 831440df..cd7df667 100644 --- a/model_data/BoreholeClient.py +++ b/etl/spatial/BoreholeClient.py @@ -56,7 +56,7 @@ class BoreholeClient: # EXAMPLE # There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of - # entries in here if possible before we produce any form of comparison between our properties, to infer + # entries in here if possible before we produce any form of comparison between our property_change, to infer # the distance from the property to the nearest borehole # Let's take a sample diff --git a/model_data/simulation_system/MLModel/__init__.py b/etl/spatial/__init__.py similarity index 100% rename from model_data/simulation_system/MLModel/__init__.py rename to etl/spatial/__init__.py diff --git a/etl/wall_area/__init__.py b/etl/wall_area/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/model_data/simulation_system/area_data.py b/etl/wall_area/app.py similarity index 99% rename from model_data/simulation_system/area_data.py rename to etl/wall_area/app.py index f43093f1..9eef73f8 100644 --- a/model_data/simulation_system/area_data.py +++ b/etl/wall_area/app.py @@ -1,5 +1,5 @@ """ -This script produces the dataset used to model the wall area of properties, which is used to estimate the cost +This script produces the dataset used to model the wall area of property_change, which is used to estimate the cost of insulation measures within homes """ import os diff --git a/etl/wall_area/requirements.txt b/etl/wall_area/requirements.txt new file mode 100644 index 00000000..56790fc5 --- /dev/null +++ b/etl/wall_area/requirements.txt @@ -0,0 +1 @@ +PyPDF2 \ No newline at end of file diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf index 399a6ff5..5f4fdba1 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/main.tf @@ -83,7 +83,7 @@ resource "aws_db_instance" "default" { publicly_accessible = true } -# Set up the bucket that recieve the csv uploads of properties to be retrofit +# Set up the bucket that recieve the csv uploads of property_change to be retrofit module "s3_presignable_bucket" { source = "./modules/s3_presignable_bucket" bucketname = "retrofit-plan-inputs-${var.stage}" diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py deleted file mode 100644 index 021e503d..00000000 --- a/model_data/analysis/SapModel.py +++ /dev/null @@ -1,650 +0,0 @@ -import numpy as np -import pandas as pd -import statsmodels.api as sm -import matplotlib.pyplot as plt -from typing import Dict, Optional, List -from sklearn.model_selection import train_test_split -from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \ - median_absolute_error, mean_absolute_percentage_error -from sklearn.ensemble import RandomForestRegressor -from sklearn.inspection import permutation_importance -from model_data.EpcClean import EpcClean - -from statsmodels.stats.outliers_influence import variance_inflation_factor -from tqdm import tqdm -from utils.logger import setup_logger - -logger = setup_logger() - - -class SapModel: - # We want to estimate for making improvements on different property components - RESPONSE = "current-energy-efficiency" - # We could potentially build models by constituency to avoid having too many - # features in the model - BASE_FEATURES = [ - "property-type", - "built-form", - "construction-age-band", - "number-habitable-rooms", - "constituency", - "number-heated-rooms", - "transaction-type" - ] - - COMPONENT_FEATURES = [ - "walls-description", - "floor-description", - "lighting-description", - "roof-description", - "mainheat-description", - "hotwater-description", - "main-fuel", - "mechanical-ventilation", - "secondheat-description", - "energy-tariff", - "solar-water-heating-flag", - "photo-supply", - "windows-description", - "glazed-type", - "glazed-area", - "multi-glaze-proportion", - # "lighting-description" # Might not need to use this - "low-energy-lighting", - "number-open-fireplaces", - "mainheatcont-description", - "fixed-lighting-outlets-count", - "floor-height", - "floor-level", - "total-floor-area", - "extension-count", - ] - - CATEGORICAL_COLS = [ - "property-type", - "built-form", - "number-habitable-rooms", - "constituency", - "number-heated-rooms", - "mainheat-description", - "hotwater-description", - "main-fuel", - "mechanical-ventilation", - "secondheat-description", - "energy-tariff", - "solar-water-heating-flag", - "windows-description", - "glazed-type", - "glazed-area", - "construction-age-band", - "lighting-description", - "mainheatcont-description", - "floor-level", - ] - - NUMERICAL_COLUMNS = [ - "photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces", - "fixed-lighting-outlets-count", - "floor-height", - "total-floor-area", - "extension-count", - ] - - # For the moment, we store records of the best performing models as a benchmark for future imporvements - BEST_FIT = { - 'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763, - 'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118, - 'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197 - } - - BEST_PREDICT = { - 'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514, - 'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312, - 'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828 - } - - BEST_FINAL = { - 'MAPE': 0.04841470773386795, 'Mean Squared Error': 21.323052316630914, 'Mean Absolute Error': 2.988547998636157, - 'R2 Score': 0.7633662459299112, 'Explained Variance Score': 0.7633785339028832, - 'Median Absolute Error': 1.9487883489495985 - } - - BUCKET_VARIABLES = [ - "number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion' - ] - - def __init__( - self, data: List[Dict], - cleaner: EpcClean, - test_size: Optional[float] = 0.2, - random_state: Optional[int] = None - ): - self.df = pd.DataFrame(data) - self.cleaner = cleaner - self.random_state = random_state if random_state is not None else 42 - self.test_size = 0.2 if test_size is None else test_size - - self.model_data = None - self.train_x = None - self.train_y = None - self.test_x = None - self.test_y = None - - self.test_model = None - self.final_model = None - - self.fit_error = None - self.predict_error = None - self.final_error = None - self.worst = { - "fit_errors": pd.DataFrame(), - "prediction_errors": pd.DataFrame(), - "fit_x": pd.DataFrame(), - "prediction_x": pd.DataFrame(), - "final_errors": pd.DataFrame(), - "final_x": pd.DataFrame(), - } - - self.fit_df = None - self.predict_df = None - self.final_fit_df = None - self.diagnosis = {} - - def run(self, plot: bool = False) -> None: - """ - A pipeline method to run all necessary methods in correct order. - :param plot: Boolean to indicate whether to plot the regression - """ - try: - self.create_dataset() - self.fit_model() - if plot: - self.plot_regression(self.fit_df) - except Exception as e: - logger.error("An error occurred during execution.") - logger.error(str(e)) - - def _merge_with_u_values( - self, model_data: pd.DataFrame, description: str, thermal_transmittance: str - ) -> pd.DataFrame: - - """ - Utility function to merge u value data with model data - :param model_data: Pandas dataframe which is the main modelling dataset - :param description: Name of the description column for which we're merging u-values onto - :param thermal_transmittance: Name of the thermal transmittance column - :return: - """ - - u_values = pd.DataFrame(self.cleaner.cleaned[f"{description}-description"])[ - ["original_description", thermal_transmittance]].rename( - columns={thermal_transmittance: f"{description}_u_value"} - ) - - model_data = model_data.merge( - u_values, - how="left", - left_on=f"{description}-description", - right_on="original_description" - ).drop(columns=["original_description"]) - - return model_data - - def _append_cleaned_data(self, model_data: pd.DataFrame) -> pd.DataFrame: - """ - Appends cleaned data into the model data. - :param model_data: Original model data. - :return: Model data with cleaned data appended. - """ - for description in ["walls", "floor", "roof"]: - model_data = self._merge_with_u_values(model_data, description, "thermal_transmittance") - - # lighting_proportions added separately as it doesn't use the _merge_with_u_values method - lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[ - ["original_description", "low_energy_proportion"]] - - model_data = model_data.merge( - lighting_proportions, - how="left", - left_on="lighting-description", - right_on="original_description" - ).drop(columns=["original_description"]) - - return model_data - - @staticmethod - def _convert_transaction_type(model_data: pd.DataFrame) -> pd.DataFrame: - """ - Converts transaction type to boolean - :param model_data: Model data with transaction type. - :return: Model data with converted transaction type. - """ - model_data["is_rdsap"] = model_data["transaction-type"] != "new dwelling" - model_data = model_data.drop(columns=["transaction-type"]) - return model_data - - @staticmethod - def bucket_and_fill(df: pd.DataFrame, column_name: str, n_bins: int = 10) -> pd.DataFrame: - """ - Simple utility function to bucket up features into bins and then fill any missing values with "NO_RECORD" - :param df: Dataframe of features to be binned - :param column_name: Name of the column to be binned - :param n_bins: Number of bins to use - :return: Dataframe with binned column - """ - # Check if the column is numerical - if np.issubdtype(df[column_name].dtype, np.number): - # Create a new categorical column from numerical one by binning the data - df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str) - # Replace missing data with "NO_RECORD" - df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD") - df[column_name + "_bucket"] = np.where( - df[column_name + "_bucket"] == "nan", - "NO_RECORD", - df[column_name + "_bucket"] - ) - return df - - def _clean_numericals(self, model_data): - - # Try binning numericals - remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in self.BUCKET_VARIABLES] - - for col in self.BUCKET_VARIABLES: - model_data[col] = pd.to_numeric(model_data[col], errors='coerce') - # If all values are missing, set all values to 0 - this column will get dropped - if all(pd.isnull(model_data[col])): - model_data[col + "_bucket"] = "NO_RECORD" - continue - model_data = self.bucket_and_fill(model_data, col) - - # Replace the data with the binned version - model_data = model_data.drop(columns=self.BUCKET_VARIABLES) - model_data = model_data.rename( - columns=dict(zip([c + "_bucket" for c in self.BUCKET_VARIABLES], self.BUCKET_VARIABLES)) - ) - - # Basic fill the rest of the columns with 0 - currenrtly this provided the best performance - for col in remaining_numericals: - model_data[col] = np.where( - model_data[col] == "", "0", model_data[col] - ).astype(float) - - return model_data - - @staticmethod - def clean_missings(model_data: pd.DataFrame) -> pd.DataFrame: - """ - Fills categorical missing data with sensible values - :param model_data: Original model data. - :return: Model data with cleaned categorical data. - """ - - # Cleaning of energy-tariff and construction-age-band hurt prediction performance, indicating there is - # potentially - # a notable difference between a "" missing and a "NO DATA!" missing, worth differentiating - - model_data["mechanical-ventilation"] = np.where( - model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"] - ) - - model_data["solar-water-heating-flag"] = np.where( - model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"] - ) - - model_data["glazed-type"] = np.where( - model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"] - ) - - model_data["glazed-area"] = np.where( - model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"] - ) - - return model_data - - def create_dataset(self): - logger.info("Creating modelling dataset") - model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES] - model_data = model_data.reset_index(drop=True) - model_data["idx"] = model_data.index.copy() - - # Append on u-values - model_data = self._append_cleaned_data(model_data) - - model_data = self.clean_missings(model_data) - - # Convert transaction_type - model_data = self._convert_transaction_type(model_data) - - # Clean numerical columns - model_data = self._clean_numericals(model_data) - - # Take just entries with U-values - # TODO: Rather than doing this, do we want to include the estimated u-values? - # Since this ends up with just 2k entries - model_data = model_data[ - ~pd.isnull(model_data["walls_u_value"]) & - ~pd.isnull(model_data["floor_u_value"]) & - ~pd.isnull(model_data["roof_u_value"]) - ] - - exclude_features = [ - "walls-description", "floor-description", "roof-description", "transaction-type" - ] - - features = [ - x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [ - "walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx", "is_rdsap" - ] if x not in exclude_features - ] - - model_data = model_data[features] - - for col in self.CATEGORICAL_COLS: - model_data[col] = model_data[col].astype('category') - - # Convert response - model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float) - - self.model_data = model_data - - def make_training_test(self, x): - # Split into training and test - self.train_x, self.test_x, self.train_y, self.test_y = train_test_split( - x.drop(self.RESPONSE, axis=1), - x[self.RESPONSE], - test_size=self.test_size, - random_state=self.random_state - ) - - @staticmethod - def remove_zero_std_cols(train_x, test_x=None, threshold=1e-3): - """ - Utility function to remove columns that have zero standard deviation from both test and train sets - :param train_x: Training data to remove columns from - :param test_x: If provided, remove the same columns from the test data - :param threshold: float value, if the standard deviation is below this threshold, the column is considered - to have zero standard deviation - :return: Tuple of train_x and test_x (if provided). If test_x is not provided, a null placeholder is returned - """ - # Compute standard deviations - std_devs = train_x.std() - - # Find columns with zero or near-zero standard deviation - zero_std_cols = std_devs[std_devs <= threshold].index - - # Drop these columns from the training data - train_x = train_x.drop(zero_std_cols, axis=1) - - if test_x is not None: - # Ensure the test data has the same columns - test_x = test_x[train_x.columns] - return train_x, test_x - - return train_x, None - - def fit_model(self): - """ - Main function to fit the model and produce accuracy metrics - """ - - x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True) - - # Convert booleans to integer - for col in x.columns: - if x[col].dtype == bool: - x[col] = x[col].astype(int) - - if x[col].dtype == object: - x[col] = x[col].astype(float) - - # Create the training and test sets for each run - self.make_training_test(x) - self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x) - logger.info("Detecting multi-collinearity in training dataset") - self.detect_multi_collinearity() - - # Add a constant to the independent value - train_x = sm.add_constant(self.train_x) - test_x = sm.add_constant(self.test_x) - train_idx = train_x["idx"].copy() - test_idx = self.test_x["idx"].copy() - train_x = train_x.drop(columns=["idx"]) - test_x = test_x.drop(columns=["idx"]) - - logger.info("Fitting testing model") - # make regression model - model = sm.OLS(self.train_y, train_x) - # fit model and print results - self.test_model = model.fit() - - train_predictions = self.test_model.fittedvalues - test_predictions = self.test_model.predict(test_x) - - self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics( - y_true=self.train_y, y_pred=train_predictions - ) - - # Predict on new data - self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics( - y_true=self.test_y, y_pred=test_predictions - ) - - fit_success = self.check_successes(self.fit_error, self.BEST_FIT) - predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT) - - self.model_data['fit'] = self.test_model.fittedvalues - # The worst errors over index heavily for flats - self.worst["fit_x"] = self.model_data[self.model_data.index.isin(self.worst["fit_errors"].index)] - self.worst["prediction_x"] = self.model_data[self.model_data.index.isin(self.worst["prediction_errors"].index)] - - self.fit_df = pd.DataFrame( - { - "fit": train_predictions, - "actual": self.train_y, - "idx": train_idx - } - ).sort_values("actual", ascending=True) - - self.predict_df = pd.DataFrame( - { - "predictions": test_predictions, - "actual": self.test_y, - "idx": test_idx - } - ) - - self.diagnosis = { - "fit_success": fit_success, - "predict_success": predict_success, - "summary": self.test_model.summary() - } - - # We're now ready to fit the final model - # For the momeent, the pre-processing at the top of this function merely removes columns, so we - # just need to remove the columns that were removed from the training data from the final model - logger.info("Fitting final model") - x = sm.add_constant(x) - y = x[self.RESPONSE] - x = x[self.train_x.columns] - idx = x["idx"].copy() - x = x.drop(columns=["idx"]) - - final_model = sm.OLS(y, x) - # fit model and print results - self.final_model = final_model.fit() - final_predictions = self.final_model.fittedvalues - - self.final_error, self.worst["final_errors"] = self.calculate_regression_metrics( - y_true=y, y_pred=final_predictions - ) - - self.final_fit_df = pd.DataFrame( - { - "fit": final_predictions, - "actual": y, - "idx": idx - } - ).sort_values("actual", ascending=True) - - @staticmethod - def check_successes(experiment_error, best_error): - """ - Simple function to check if the experiment error is better than the best error - :param experiment_error: output of calculate_regression_metrics() on the experiment - :param best_error: Current benchmark best error - :return: - """ - - successes = [] - for k in experiment_error: - if k in ["Explained Variance Score", "R2 Score"]: - # We want to maximise this so we want experiment error to be higher - successes.append( - { - "measure": k, - "success": experiment_error[k] >= best_error[k], - "difference": abs(experiment_error[k] - best_error[k]) - } - ) - continue - successes.append( - { - "measure": k, - "success": experiment_error[k] <= best_error[k], - "difference": abs(experiment_error[k] - best_error[k]) - } - ) - - return pd.DataFrame(successes) - - def rf_importance(self, train_x, train_y, test_x, test_y): - """ - Utility function to estimate feature importance using a random forest - This is useful to get a sense of some of the key features which are driving model - performance - - :param train_x: Training data covariates to build the importance model on - :param train_y: Training data response to build the importance model on - :param test_x: Test data covariates to build the permutation importance model on - :param test_y: Test data response to build the permutation importance model on - :return: Pandas dataframe of feature importances, ranked by most important to least - """ - - rf = RandomForestRegressor(random_state=self.random_state) - rf.fit(train_x, train_y) - - # Print the name and importance of each feature - rf_importance_df = [] - for feature, importance in zip(train_x.columns, rf.feature_importances_): - rf_importance_df.append( - { - "Feature": feature, - "rf_importance": importance - } - ) - rf_importance_df = pd.DataFrame(rf_importance_df) - rf_importance_df = rf_importance_df.sort_values(by="rf_importance", ascending=False) - - perm_importance = self.permuation_importance(rf, test_x, test_y) - - return rf_importance_df, perm_importance - - @staticmethod - def permuation_importance(rf, test_x, test_y): - """ - Simple utility function to produce permutation importance for a given model\ - :param rf: Random forest model to calculate permutation importance for - :param test_x: Test covariates to be used for permutation importance - :param test_y: Test response to be used for permutation importance - :return: - """ - perm_importance = permutation_importance(rf, test_x, test_y, scoring='neg_mean_squared_error') - perm_importance_df = pd.DataFrame( - { - "Feature": test_x.columns, - "perm_importance": perm_importance.importances_mean - } - ).sort_values(by="perm_importance", ascending=False) - - return perm_importance_df - - def detect_multi_collinearity(self): - # Get the VIFs for each variable - vifs = pd.DataFrame() - vifs["features"] = self.train_x.columns - vifs["vif"] = [variance_inflation_factor(self.train_x.values, i) for i in tqdm(range(self.train_x.shape[1]))] - - # Get the features with the highest VIF - vifs = vifs.sort_values("vif", ascending=False) - - # There are some features, we do not want to remove - required_features = [ - "walls_u_value", "floor_u_value", "roof_u_value", "idx", "is_rdsap" - ] - - vifs = vifs[~vifs["features"].isin(required_features)] - drop_vifs = vifs[np.isinf(vifs["vif"])] - - # Acceptable drop variables: - # main-fuel_Gas: mains gas - # glazed-type_NO DATA! - # glazed-area_NO DATA! - - self.train_x = self.train_x.drop(columns=drop_vifs["features"].values) - self.test_x = self.test_x[self.train_x.columns] - - @staticmethod - def plot_regression(df): - # Extract the "fit" and "actual" columns from the dataframe - fit = df['fit'] - actual = df['actual'] - - # Create an array of x-values (assumed to be sequential integers) - x = np.arange(len(df)) - - # Plot the fit and actual data - plt.plot(x, fit, color='red', label='Fit') - plt.plot(x, actual, color='blue', label='Actual') - - # Set labels and title - plt.xlabel('Index') - plt.ylabel('Value') - plt.title('Linear Regression - Fit vs Actual') - - # Display legend - plt.legend() - - # Show the plot - plt.show() - - @staticmethod - def calculate_regression_metrics(y_true, y_pred, n=20): - """ - Calculate the 5 most important accuracy metrics for regression. - - Args: - y_true (array-like): Array of true target values. - y_pred (array-like): Array of predicted target values. - - Returns: - dict: Dictionary containing the calculated metrics. - """ - metrics = { - 'MAPE': mean_absolute_percentage_error(y_true, y_pred), - 'Mean Squared Error': mean_squared_error(y_true, y_pred), - 'Mean Absolute Error': mean_absolute_error(y_true, y_pred), - 'R2 Score': r2_score(y_true, y_pred), - 'Explained Variance Score': explained_variance_score(y_true, y_pred), - 'Median Absolute Error': median_absolute_error(y_true, y_pred) - } - - errors = pd.DataFrame() - errors['Fit'] = y_true - errors['Actual'] = y_pred - errors['Residual'] = errors['Actual'] - errors['Fit'] - errors['Absolute Residual'] = np.abs(errors['Residual']) - - worst_errors = errors.nlargest(n, 'Absolute Residual') - - return metrics, worst_errors diff --git a/model_data/analysis/UvalueEstimations.py b/model_data/analysis/UvalueEstimations.py deleted file mode 100644 index 0a1441bb..00000000 --- a/model_data/analysis/UvalueEstimations.py +++ /dev/null @@ -1,207 +0,0 @@ -import pickle -import pandas as pd -import numpy as np -from model_data.EpcClean import EpcClean - - -class UvalueEstimations: - def __init__(self, data: list): - """ - Initialize the UvalueEstimations class. - - :param data: The input data as a list of dictionaries, to be converted to a dataframe - """ - self.data = pd.DataFrame(data) - self.walls = None - self.walls_decile_data = {} - self.roofs = None - self.floors = None - self.floors_decile_data = {} - - def get_estimates(self, cleaner: EpcClean): - """ - Calculate U-value estimates for walls, roofs, and floors. - - :param cleaner: An instance of the EpcClean class used for cleaning data. - """ - self.set_walls(cleaner) - self.set_roofs(cleaner) - self.set_floors(cleaner) - - def set_walls(self, cleaner: EpcClean): - """ - Set U-value estimates for walls. - - :param cleaner: An instance of the EpcClean class used for cleaning data. - """ - walls_columns = [ - "local-authority", "property-type", "walls-description", "walls-energy-eff", "walls-env-eff", "built-form", - "total-floor-area", "number-habitable-rooms", "number-heated-rooms" - ] - - walls_df = self.data[self.data["walls-description"].str.contains("Average thermal transmittance")] - - # Take just the columns we want - walls_df = walls_df[walls_columns] - walls_df["total-floor-area"] = walls_df["total-floor-area"].astype(float) - - walls_df, decile_labels, decile_boundaries = self.classify_into_deciles(walls_df, "total-floor-area") - - # We now get the U-values - walls_df = walls_df.merge( - pd.DataFrame(cleaner.cleaned['walls-description'])[["original_description", "thermal_transmittance"]], - how="left", - right_on="original_description", - left_on="walls-description" - ) - - u_value_summary = walls_df.groupby( - [ - "local-authority", - "property-type", - "walls-energy-eff", - "walls-env-eff", - "built-form", - "number-habitable-rooms", - "number-heated-rooms", - "total-floor-area_group" - ], - observed=True - ).agg({"thermal_transmittance": ["median", "size"]}).reset_index() - - u_value_summary.columns = [ - "local-authority", - "property-type", - "walls-energy-eff", - "walls-env-eff", - "built-form", - "number-habitable-rooms", - "number-heated-rooms", - "total-floor-area_group", - "median_thermal_transmittance", - "n_samples" - ] - - self.walls = u_value_summary - self.walls_decile_data = { - "decile_labels": decile_labels, - "decile_boundaries": decile_boundaries - } - - def set_roofs(self, cleaner: EpcClean): - """ - Set U-value estimates for roofs. - - :param cleaner: An instance of the EpcClean class used for cleaning data. - """ - pass - - def set_floors(self, cleaner: EpcClean): - """ - Set U-value estimates for floors. - - :param cleaner: An instance of the EpcClean class used for cleaning data. - """ - floors_columns = [ - "local-authority", "property-type", "floor-description", "floor-energy-eff", "floor-env-eff", - "built-form", - "total-floor-area", "number-habitable-rooms", "number-heated-rooms" - ] - - floors_df = self.data[self.data["floor-description"].str.contains("Average thermal transmittance")] - - # Take just the columns we want - floors_df = floors_df[floors_columns] - floors_df["total-floor-area"] = floors_df["total-floor-area"].astype(float) - - floors_df, decile_labels, decile_boundaries = self.classify_into_deciles(floors_df, "total-floor-area") - - # We now get the U-values - floors_df = floors_df.merge( - pd.DataFrame(cleaner.cleaned['floor-description'])[["original_description", "thermal_transmittance"]], - how="left", - right_on="original_description", - left_on="floor-description" - ) - - u_value_summary = floors_df.groupby( - [ - "local-authority", - "property-type", - "floor-energy-eff", - "floor-env-eff", - "built-form", - "number-habitable-rooms", - "number-heated-rooms", - "total-floor-area_group" - ], - observed=True - ).agg({"thermal_transmittance": ["median", "size"]}).reset_index() - - u_value_summary.columns = [ - "local-authority", - "property-type", - "floor-energy-eff", - "floor-env-eff", - "built-form", - "number-habitable-rooms", - "number-heated-rooms", - "total-floor-area_group", - "median_thermal_transmittance", - "n_samples" - ] - - self.floors = u_value_summary - self.floors_decile_data = { - "decile_labels": decile_labels, - "decile_boundaries": decile_boundaries - } - - @staticmethod - def classify_into_deciles(df: pd.DataFrame, column: str) -> (pd.DataFrame, list, list): - """ - Break a column in a Pandas DataFrame into deciles and classify new values into the existing deciles. - - :param df: The input Pandas DataFrame. - :param column: The column name to break into deciles. - - :return: A tuple containing: - - The DataFrame with the decile group column. - - The list of decile labels. - - The list of decile boundaries. - """ - # Calculate decile boundaries - decile_boundaries = np.percentile(df[column], np.arange(0, 101, 10)) - - # Create decile labels - decile_labels = [f"Decile {i + 1}" for i in range(10)] - - # Assign decile labels to existing values - df[column + "_group"] = pd.cut(df[column], bins=decile_boundaries, labels=decile_labels, - include_lowest=True) - - return df, decile_labels, decile_boundaries - - @staticmethod - def classify_decile_newvalues(decile_boundaries, decile_labels, new_values: list) -> list: - """ - Classify new values into existing deciles based on decile definitions. - - :param decile_boundaries: The list of decile boundaries. - :param decile_labels: The list of decile labels. - :param new_values: A list of new values to classify. - - :return: The classifications for the new values as a list. - """ - # Classify new values based on decile definitions - classifications = pd.cut(new_values, bins=decile_boundaries, labels=decile_labels, include_lowest=True) - return classifications.tolist() - - def _save(self, filename): - """ - Useful utility function to store this object, which is particularly handy for unit testing - :return: - """ - with open(filename, 'wb') as f: - pickle.dump(self, f) - \ No newline at end of file diff --git a/model_data/plotting/plotting_functions.py b/model_data/plotting/plotting_functions.py deleted file mode 100644 index 049ba248..00000000 --- a/model_data/plotting/plotting_functions.py +++ /dev/null @@ -1,40 +0,0 @@ -import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt - - -def create_heatmap_plots(data, response_var, pivot_var1, pivot_var2, order1=None, order2=None): - """ - Create a heatmap plot based on a list of data and given variables. - - :param data: List of dictionaries, input data. - :param response_var: String, response variable to be plotted. - :param pivot_var1: String, first pivot variable to be used in the plot. - :param pivot_var2: String, second pivot variable to be used in the plot. - :param order1: List, the order of categories for pivot_var1. Optional. - :param order2: List, the order of categories for pivot_var2. Optional. - - Returns: - None. Displays the generated plot. - """ - - # Create a DataFrame from your list of dictionaries - df = pd.DataFrame(data) - - # Convert the response variable column to float type if it's not already - df[response_var] = df[response_var].astype(float) - - # Create a pivot table - pivot = df.pivot_table(index=pivot_var1, columns=pivot_var2, values=response_var) - - # If an order is provided, reorder the pivot table - if order1 is not None: - pivot = pivot.reindex(order1) - if order2 is not None: - pivot = pivot[order2] - - # Plot the heatmap - plt.figure(figsize=(10, 6)) - sns.heatmap(pivot, annot=True, fmt=".2f", cmap='coolwarm') - plt.title(f"Heatmap of {response_var} by {pivot_var1} and {pivot_var2}") - plt.show() diff --git a/model_data/simulation_system/Dockerfiles/Dockerfile.prediction b/model_data/simulation_system/Dockerfiles/Dockerfile.prediction deleted file mode 100644 index 88cea6c4..00000000 --- a/model_data/simulation_system/Dockerfiles/Dockerfile.prediction +++ /dev/null @@ -1,38 +0,0 @@ -FROM python:3.10.12-slim as release - -ARG USER=nroot -ARG UID=1000 -ARG GID=100 - -# Install patches -RUN apt-get update && apt-get upgrade -y \ - && apt-get install libgomp1 -y \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists - -# Install python packages -COPY ../requirements/predictions/predictions.txt requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - -# Create and configure non-root user -RUN useradd \ - --create-home \ - --uid ${UID} \ - --gid ${GID} \ - --shell /bin/bash \ - ${USER} - -# Copy the project code to user home directory -COPY --chown=${UID}:${GID} ./core /home/simulation_system/core -COPY --chown=${UID}:${GID} ./MLModel /home/simulation_system/MLModel -COPY --chown=${UID}:${GID} ./predictions.py /home/simulation_system/predictions.py - -# FOR TESTING -# COPY --chown=${UID}:${GID} ./model_build_data /home/simulation_system/model_build_data - -# Switch user -USER ${USER} -WORKDIR /home/simulation_system - -# Run the python command -CMD ["python3", "predictions.py", "--data-path", "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet"] diff --git a/model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda b/model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda deleted file mode 100644 index f3708ed9..00000000 --- a/model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda +++ /dev/null @@ -1,22 +0,0 @@ -FROM public.ecr.aws/lambda/python:3.10 - -# Set the working directory -WORKDIR ${LAMBDA_TASK_ROOT}/simulation_system -ENV PYTHONPATH "${PYTHONPATH}:${LAMBDA_TASK_ROOT}/simulation_system" - -# Install necessary build tools - required to test locally -RUN yum install -y gcc python3-devel - -# Install python packages -COPY requirements/predictions/predictions.txt ./requirements.txt -RUN pip install --no-cache-dir -r ./requirements.txt - -# Copy the project code to the working directory -COPY ./core ./core -COPY ./MLModel ./MLModel -COPY ./predictions.py ./predictions.py -COPY ./handlers/predictions_app.py ./predictions_app.py -COPY ./__init__.py ./__init__.py - -# Run off a lambda trigger -CMD [ "simulation_system.predictions_app.handler" ] diff --git a/model_data/simulation_system/Dockerfiles/Dockerfile.training b/model_data/simulation_system/Dockerfiles/Dockerfile.training deleted file mode 100644 index 1eadee7b..00000000 --- a/model_data/simulation_system/Dockerfiles/Dockerfile.training +++ /dev/null @@ -1,38 +0,0 @@ -FROM python:3.10.12-slim as release - -ARG USER=nroot -ARG UID=1000 -ARG GID=100 - -# Install patches -RUN apt-get update && apt-get upgrade -y \ - && apt-get install libgomp1 -y \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists - -# Install python packages -COPY ../requirements/training/training.txt requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - -# Create and configure non-root user -RUN useradd \ - --create-home \ - --uid ${UID} \ - --gid ${GID} \ - --shell /bin/bash \ - ${USER} - -# Copy the project code to user home directory -COPY --chown=${UID}:${GID} ./core /home/simulation_system/core -COPY --chown=${UID}:${GID} ./MLModel /home/simulation_system/MLModel -COPY --chown=${UID}:${GID} ./training.py /home/simulation_system/training.py - -# FOR TESTING -# COPY --chown=${UID}:${GID} ./model_build_data /home/simulation_system/model_build_data - -# Switch user -USER ${USER} -WORKDIR /home/simulation_system - -# Run the python command -CMD ["python3", "training.py", "--train-filepath", "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet", "--test-filepath", "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data.parquet"] diff --git a/model_data/simulation_system/Dockerfiles/Dockerfile.training.lambda b/model_data/simulation_system/Dockerfiles/Dockerfile.training.lambda deleted file mode 100644 index 34d3fe42..00000000 --- a/model_data/simulation_system/Dockerfiles/Dockerfile.training.lambda +++ /dev/null @@ -1,14 +0,0 @@ -FROM public.ecr.aws/lambda/python:3.10 - -# Install python packages -COPY ../requirements/training/training.txt requirements.txt -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the project code to user home directory -COPY ./core ${LAMBDA_TASK_ROOT}/simulation_system/core -COPY ./MLModel ${LAMBDA_TASK_ROOT}/simulation_system/MLModel -COPY ./training.py ${LAMBDA_TASK_ROOT}/simulation_system/training.py -COPY ./handlers/training_app.py ${LAMBDA_TASK_ROOT}/simulation_system/training_app.py - -# Run off a lambda trigger -CMD [ "training_app.handler" ] diff --git a/model_data/simulation_system/MLModel/BaseMLModel.py b/model_data/simulation_system/MLModel/BaseMLModel.py deleted file mode 100644 index aa10e26f..00000000 --- a/model_data/simulation_system/MLModel/BaseMLModel.py +++ /dev/null @@ -1,66 +0,0 @@ -""" -BaseMLModel class -This is the base protocol: -- Any implementation will be its own seperate file -Key tasks: -- Template Model class for different model types -- Save model -- Load Model -- Generate Inference -""" - -from numpy import ndarray -from pathlib import Path -from typing import Protocol, NamedTuple, Any -import pandas as pd -from training import S3FSClient - - -class MLModel(Protocol): - """ - Base ML Model protocol - """ - - def load_model(self, filepath: Path) -> None: - """ - Providing a path, this function will load the model to be used. Will load to internal variable - """ - - def save_model( - self, output_filepath: Path, s3_client: S3FSClient | None = None - ) -> None: - """ - Providing a path, this function will save the model to be used. - """ - - def train_model( - self, data: pd.DataFrame, target_column: str, hyperparameters: dict - ) -> None: - """ - For the given data and hyperparameters (specified to the model), a model is trained - """ - - def generate_predictions(self, data: pd.DataFrame) -> ndarray[Any, Any] | None: - """ - For the given dataframe, model is loaded and predictions are generated - """ - - def model_evaluation( - self, - validation_data: pd.DataFrame, - target_column: str, - metrics_location: Path | None = None, - ) -> pd.DataFrame | None: - """ - For any validation data, a set of predictions and metrics are return - """ - - def optimise_model_for_deployment(self): - """ - Perfomance post processing on Model to ensure ready for deployment - """ - - def model_metadata(self) -> dict | None: - """ - Extract out model metadata as dictionary - """ diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py deleted file mode 100644 index ac8f1409..00000000 --- a/model_data/simulation_system/MLModel/Models.py +++ /dev/null @@ -1,215 +0,0 @@ -""" -Different implementations of the MLModel Protocol -Uses the BaseMLModel protocol -Key tasks: -- Template Model class for different model types -- Save model -- Load Model -- Generate Inference -""" - -import os -from typing import Any -from pathlib import Path -import pandas as pd -from autogluon.tabular import TabularDataset, TabularPredictor -from core.Logger import logger -from core.Metrics import Metrics -from core.Settings import METRIC_FILENAME -from core.CloudClient import BotoClient - -AUTOGLUON_HYPERPARAMETERS = [ - "problem_type", - "eval_metric", - "time_limit", - "presets", - "excluded_model_types", -] - - -def model_factory(model_type: str, hyperparameters: dict) -> dict: - """ - Use factory pattern to register the different ML implementations - """ - - model_types = { - "autogluon": { - "model": AutogluonModel, - "naming_attributes": f"{hyperparameters['presets']}-{hyperparameters['time_limit']}", - }, - } - - return model_types[model_type] - - -class AutogluonModel: - """ - Autogluon model that implements the MLModel Protocol - """ - - def __init__(self, output_filepath: Path | None = None) -> None: - self.model = None - self.output_filepath = output_filepath - self.predictions = None - - def load_model( - self, - filepath: str | Path, - client: BotoClient, - model_folder: str = "local_model", - ) -> None: - """ - Providing a path, this function will load the model to be used. Will load to internal variable - """ - filepath = str(filepath) - if client.client is None: - logger.info("In local development mode - no need for s3 client") - self.model = TabularPredictor.load(path=filepath) - else: - logger.info(f"Loading model from s3 with filepath: %s and model_folder: %s" % (filepath, model_folder)) - client.download_model(filepath=filepath, model_folder=model_folder) - self.model = TabularPredictor.load(path=str(Path(model_folder) / filepath)) - - def save_model(self, output_filepath: Path, client: BotoClient) -> None: - """ - Providing a path, this function will save the model to be used. - """ - if client.client is None: - logger.info("In local development mode - no need for s3 client") - logger.info("Using AutoGluon Model - Model saving already occured") - else: - logger.info(f"Saving model into s3") - - self.directory_upload( - client=client, - local_directory=str(output_filepath), - bucket_name=client.model_bucket, - ) - - logger.info("Save complete") - - def directory_upload(self, client, local_directory, bucket_name): - - # Iterate through the local directory and upload each file - for root, dirs, files in os.walk(local_directory): - for file in files: - # Determine the local file path and S3 object key - local_file_path = os.path.join(root, file) - s3_object_key = os.path.relpath(local_file_path, local_directory) - - # Upload the file to S3 - client.client.upload_file(local_file_path, bucket_name, local_file_path) - - logger.info( - f"Uploaded {local_file_path} to {bucket_name}/{local_file_path}" - ) - - def train_model( - self, data: pd.DataFrame, target_column: str, hyperparameters: dict - ) -> None: - """ - For the given data and hyperparameters, a model is trained - """ - if self.output_filepath is None: - logger.error("Please specify a output_filepath in order to train a model") - exit(1) - - if set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters.keys()): - print( - "Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required" - ) - exit(1) - - AGdata = TabularDataset(data=data) - - self.model = TabularPredictor( - label=target_column, - path=self.output_filepath, - problem_type=hyperparameters["problem_type"], - eval_metric=hyperparameters["eval_metric"], - ).fit( - AGdata, - time_limit=hyperparameters["time_limit"], - presets=hyperparameters["presets"], - excluded_model_types=hyperparameters["excluded_model_types"], - ) - - def generate_predictions(self, data: pd.DataFrame) -> pd.Series: - """ - For the given dataframe, model is loaded and predictions are generated - """ - - if self.model is None: - print("No model loaded/ trained") - exit(1) - - predictions = pd.Series(self.model.predict(data)) - - return predictions - - def model_evaluation( - self, - validation_data: pd.DataFrame, - target_column: str, - metrics: Metrics, - metrics_location: Path | None = None, - metric_filename: str = METRIC_FILENAME, - ) -> pd.DataFrame: - """ - For any validation data, a set of predictions and metrics are return - """ - if metrics_location is None: - logger.warning("Metrics will be outputted to current folder") - metrics_location = Path() - - if self.model is None: - logger.error("No model loaded/ trained - Unable to generate evaluation") - exit(1) - - # Generate prediction, load metrics suite, generate metrics betweeen the two - predictions = self.generate_predictions(validation_data) - - performance = metrics.generate_metric_suite( - actuals=validation_data[target_column], predictions=predictions - ) - - logger.info("Prediction used for evaluations are saved in self.prediction") - self.predictions = predictions - - logger.info("Saving metric file as metric.csv") - metrics_location.mkdir(exist_ok=True) - - metrics_df = pd.DataFrame([performance]) - metrics_df.to_csv(metrics_location / metric_filename) - markdown_filename = metric_filename.split(".")[0] + ".md" - metrics_df.to_markdown(metrics_location / markdown_filename) - - return metrics_df - - def optimise_model_for_deployment( - self, deployment_path: Path | str | None = None - ) -> Any: - """ - We can optimise the deployment for a autogluon model - """ - if self.model is None: - raise ValueError("No model to optimise for deployment") - - if deployment_path is None: - raise ValueError("Deployment path required") - - deployment_path = str(deployment_path) - - # This will return a string path of the location - return self.model.clone_for_deployment(deployment_path) - - def model_metadata(self) -> dict[str, Any]: - """ - For Autogluon model, use the inbuilt model info method - """ - - if self.model is None: - logger.error("No Model loaded/ trained") - exit(1) - - return self.model.info() diff --git a/model_data/simulation_system/Makefile b/model_data/simulation_system/Makefile deleted file mode 100644 index 4aad42b1..00000000 --- a/model_data/simulation_system/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -export PYENV_ROOT=$(HOME)/.pyenv -export PATH := $(PYENV_ROOT)/bin:$(PATH) -PYTHON_VERSION ?= 3.10.12 - -.PHONY: init -init: build docker - -.PHONY: training_env -env: - pyenv install $(PYTHON_VERSION) || echo "Proceeding..." # || is to swallow non-zero response if python version already is installed - pyenv global $(PYTHON_VERSION) - python3 -m venv .training_env - . .training_env/bin/activate - pip install --upgrade pip - pip install -r requirements/training/training-dev.txt && pre-commit install - - echo " --- TO ACTIVATE THE ENVIRONMENT --- " - echo "Run source .training_env/bin/activate to activate the virtual environment" - -.PHONY: check-all -check-all: pre-commit run -a - -.PHONY: build -build: - docker-compose build - -.PHONY: docker -docker: - docker-compose up -d - -.PHONY: down -down: - docker compose down diff --git a/model_data/simulation_system/README.md b/model_data/simulation_system/README.md deleted file mode 100644 index e98767ea..00000000 --- a/model_data/simulation_system/README.md +++ /dev/null @@ -1,77 +0,0 @@ -# Simulation System - -Starter Readme: -Steps for pipeline: - -- (WIP) Set up the training development environment - - Change directory to this folder (simulation_system) - - Run the following command `make env PYTHON_VERSION=3.10.12` - - This will install the specified python version using `pyenv` and select this version as the global python version - - It will install all training packages as specified in the training-dev.txt requirements file, including the pre-commit hooks - - Run `source .training_env/bin/activate` to use this environment - -- (WIP) Use Makefile to start up mock up s3 service - - By running `make init`, this will run the `docker-compose build` and `docker-compose up -d`, which spins up a S3 service - - This docker compose is running in detached mode `-d`, so will no output anything to the terminal - -- Once the Minio service is run, you can run the `training.py` file to start a model build process - - This will output a model, for a given target column, and add model name composed of some of the hyperparameters - - An example of running this file is: - - `python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet` - - Outputs of the pipeline are: - - A model directory bucket - - A target variable prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE) - - A model type prefix (i.e. autogluon, tensorflow etc) - - A model name prefix (i.e. rdsap_change_medium_quality_60_TIMESTAMP) - - This model name is made up of target variable, quality, time spent training and timestamp - - Within this prefix, there are three folders: - - model - - The model path that can be loaded in the codebase - - deployment - - The optimised model that can be deployed (may or maynot need this) - - metrics - - The metrics generatted from the model (may or may not need this as this can be contained in the registry) - -- Once model build is finished, you can run the `prediction.py` file to generate prediction - - By default, the prediction pipeline will select the best model based on **mean absolute error** from the model registry - - This can be overwritten by specifying a model_path, which will load an alternative model - - There are two ways of getting data into the pipeline: - - Using the `--data` argument: - - This is a JSON string which can be passed as `python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'` - - Note the single and double quotation marks, as this affects the ingestion - - Using the `--data-path` argument: - - This can be a filepath (Can imagine that we might want to pull data from S3/ DB) - - An example of running the file is: - - `python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet` - - Outputs of the pipeline are: - - prediction bucket - - a Target variables prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE) - - a uprn prefix (i.e 0123456789) - - a `prediction.json` - - a `metadata.json` - - This is all the metadata from the model (can change this if needed) - -- NOTE: If you wish to change any settings, these are currently all in the `Settings.py` file - - It will be separated out eventually but for now, it works to keep track of anything that we might want to respecify. - - I.e. the hyperparameters for models are in here but will move into a separate configuration file - - -# TODO: -- Structure/ MLOps: - - Add configuration files (dev, staging, prod), including hyperparamters - - Add precommit hooks (linters, branch names, etc) - - Sphinx documentation - - Sort out local mock up services - - Sort out Model Registry - - Sort out Data version control - - pre-commit hooks: - - The types of hooks that we want (safety, bandit, iso8 etc) - - The customisations we require - - Add sphinx documentation -- Data Science: - - Implement a metrics class, to hold all metric - - Rebuild metrics script (Could be a one off but good to have) - - Determine metrics - - Implement and test custom model (Tensorflow Decision Trees etc) -- Orchestration: - - Lambda handler for the pipeline diff --git a/model_data/simulation_system/docker-compose.yml b/model_data/simulation_system/docker-compose.yml deleted file mode 100644 index f875e7ef..00000000 --- a/model_data/simulation_system/docker-compose.yml +++ /dev/null @@ -1,58 +0,0 @@ -version: '3' - -services: - minio: - image: minio/minio:RELEASE.2022-05-26T05-48-41Z - ports: - - "9000:9000" - - "9001:9001" - volumes: - - ./data:/data - environment: - MINIO_ROOT_USER: &MINIO_USER admin - MINIO_ROOT_PASSWORD: &MINIO_PASS password - command: server --console-address ":9001" /data - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] - interval: 30s - timeout: 20s - retries: 3 - - simulation_system_training: - build: - context: ./ - dockerfile: ./Dockerfiles/Dockerfile.training - image: simulation_system_training - environment: - RUNTIME_ENVIRONMENT: local-mock - ENDPOINT_URL: http://minio:9000/ - AWS_ACCESS_KEY_ID: *MINIO_USER - AWS_SECRET_ACCESS_KEY: *MINIO_PASS - tty: true - depends_on: - minio: - condition: service_healthy - # command: - # ["bash"] - - # simulation_system_prediction: - # build: - # context: ./ - # dockerfile: ./Dockerfiles/Dockerfile.prediction - # image: simulation_system_prediction - # environment: - # RUNTIME_ENVIRONMENT: local-mock - # ENDPOINT_URL: http://minio:9000/ - # AWS_ACCESS_KEY_ID: *MINIO_USER - # AWS_SECRET_ACCESS_KEY: *MINIO_PASS - # tty: true - # depends_on: - # simulation_system_training: - # condition: service_completed_successfully - # command: - # ["bash"] - - - -# volumes: -# minio_storage: {} diff --git a/model_data/simulation_system/energy_predictor.py b/model_data/simulation_system/energy_predictor.py deleted file mode 100644 index ab29f39c..00000000 --- a/model_data/simulation_system/energy_predictor.py +++ /dev/null @@ -1,118 +0,0 @@ -from pathlib import Path -from core.Settings import ( - RDSAP_RESPONSE, - FLOOR_LEVEL_MAP, - BUILT_FORM_REMAP, - EARLIEST_EPC_DATE, - FULLY_GLAZED_DESCRIPTIONS, - FIXED_FEATURES, - LATEST_FIELD, - COMPONENT_FEATURES -) -from model_data.BaseUtility import Definitions -from tqdm import tqdm -import pandas as pd -import numpy as np - -from autogluon.tabular import TabularDataset, TabularPredictor - -RANDOM_SEED = 0 - -DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates' - -FLOAT_COLUMNS = [ - 'NUMBER_OPEN_FIREPLACES', - 'EXTENSION_COUNT', - 'TOTAL_FLOOR_AREA', - 'PHOTO_SUPPLY', - 'FIXED_LIGHTING_OUTLETS_COUNT', - 'FLOOR_HEIGHT', - 'NUMBER_HABITABLE_ROOMS', - 'LOW_ENERGY_LIGHTING', - 'MULTI_GLAZE_PROPORTION', - 'NUMBER_HEATED_ROOMS' -] - - -def create_raw_data(): - """ - Extract all information to do a simple predictor for RDSAP - """ - - directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] - # directories = directories[0:10] - dfs = [] - for directory in tqdm(directories): - filepath = directory / "certificates.csv" - df = pd.read_csv(filepath, low_memory=False) - - # Remove any bad uprns and ignore old/bad data - df = df[~pd.isnull(df["UPRN"])] - df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] - df = df[df["TRANSACTION_TYPE"] != "new dwelling"] - df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])] - - # Change multi glaze proportion - no_multi_glaze_proportion_index = pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & ( - df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)) - df.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100 - - # Recast - df["UPRN"] = df["UPRN"].astype(int).astype(str) - df['MAIN_HEATING_CONTROLS'] = df['MAIN_HEATING_CONTROLS'].astype(float) - - # Sort Data - df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) - - # Map all anomaly values to None - data_anomaly_map = dict(zip(Definitions.DATA_ANOMALY_MATCHES, [None] * len(Definitions.DATA_ANOMALY_MATCHES))) - - # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values - df = df.replace(data_anomaly_map) - df = df.replace(np.NAN, None) - - # Remap certain columns - df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP) - df['BUILT_FROM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP) - - # Keep only possible modelling columns - df = df[[RDSAP_RESPONSE] + list(set(FIXED_FEATURES + LATEST_FIELD + COMPONENT_FEATURES))] - - # Reduce memory usage - - # df.memory_usage() - # df.dtypes - df[RDSAP_RESPONSE] = pd.to_numeric(df[RDSAP_RESPONSE], downcast='unsigned') - df[FLOAT_COLUMNS] = df[FLOAT_COLUMNS].apply(pd.to_numeric, downcast='float') - - dfs.append(df) - - data = pd.concat(dfs) - data.to_parquet('./energy_predictor_data.parquet') - - cleaned_data = data.dropna() - # GIves you primarily flats - cleaned_data.to_parquet('./energy_predictor_cleaned_data.parquet') - - -def main(): - data = TabularDataset(data='./model_build_data/energy_data/cleaned_data/train_validation_data.parquet') - - subsample_size = round(len(data) / 100) - data = data.sample(subsample_size, random_state=RANDOM_SEED) - - predictor_RDSAP = TabularPredictor( - label=RDSAP_RESPONSE, - path="agModels-predictENERGY", - problem_type="regression", - eval_metric='mean_absolute_error' - ).fit(data, time_limit=800, presets='high_quality', excluded_model_types=['KNN', 'CAT']) - - test_data = TabularDataset('./model_build_data/energy_data/cleaned_data/test_data.parquet') - performance = predictor_RDSAP.evaluate(test_data) - predictions = predictor_RDSAP.predict(test_data) - predictor_RDSAP.feature_importance(test_data) - - -if __name__ == "__main__": - main() diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py deleted file mode 100644 index 1b3827d8..00000000 --- a/model_data/simulation_system/predictions.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -Script to load MLModel class and generate predictions -""" - -import os -import json -import argparse -from pathlib import Path -import pandas as pd -from typing import Optional -from datetime import datetime -from MLModel.Models import AutogluonModel -from core.Logger import logger -from core.DataLoader import dataloader_factory -from core.CloudClient import BotoClient -from core.Metrics import Metrics -from core.RegistryHandler import RegistryHandler -from core.Settings import ( - BASE_REGISTRY_PATH, - REGISTRY_FILE, - PREDICTION_LOCATION, - PREDICTION_FILE, - METADATA_FILE, - TIMESTAMP_FORMAT, - MODEL_DIRECTORY, -) - -TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT) -RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "dev") - -CLIENT = BotoClient(runtime_environment=RUNTIME_ENVIRONMENT) - -# FOR TESTING -# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to -# DataFrame) -# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet") -# DATA = TEST_DATA.sample(1) - -# For testing in dev s3 -# Data path can be passed as so: -# python3 predictions.py --data-path s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data.parquet -# data_path="s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet" - - -def ingest_arguments() -> argparse.Namespace: - """ - Helper function to take in arguments from script start - """ - - parser = argparse.ArgumentParser(description="Inputs for training script") - parser.add_argument( - "--target-column", - type=str, - help="The response variable you are predicting for", - choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], - default="RDSAP_CHANGE", - ) - parser.add_argument( - "--model-path", - type=str, - help="If you wish to use a specific model, specify the model path here", - ) - parser.add_argument("--data", type=str, help="Json data for predictions") - parser.add_argument( - "--data-path", type=str, help="Location of Parquet dataset to load for training" - ) - - args = parser.parse_args() - - return args - - -def prediction( - target_column: str = "RDSAP_CHANGE", - model_path: str | None = None, - data: Optional[pd.DataFrame | str] = None, - data_path: Optional[str] = None, -): - """ - Main pipeline function - """ - - if model_path is not None: - logger.info("User specified a model to load - ignoring registry") - model_location = model_path - model_type = model_path - model_name = model_path - else: - # TODO: Think about where registry will sit/ type - logger.info("Loading best model from registry") - - metrics = Metrics() - registry_handler = RegistryHandler() - - registry_path = Path(MODEL_DIRECTORY) / target_column / REGISTRY_FILE - - registry_df = registry_handler.load_registry( - registry_path=registry_path, client=CLIENT, metrics=metrics - ) - - # registry_df = pd.read_csv(registry_path) - best_model_df = registry_df[registry_df["best_model"]] - - model_location = best_model_df["model_location"].values[0] - model_type = best_model_df["model_type"].values[0] - model_name = best_model_df["model_name"].values[0] - - logger.info("--- Model Info: ---") - logger.info(f"Model type: {model_type}") - logger.info(f"Model name: {model_name}") - logger.info(f"Model location: {model_location}") - - logger.info("--- Loading Data ---") - if data is None and data_path is None: - logger.error("No Data/Data Path passed") - exit(1) - if data_path and data is None: - logger.info("Loading data from provided path") - dataloader = dataloader_factory(runtime_environment=RUNTIME_ENVIRONMENT) - data = dataloader.load(client=CLIENT, filepath=data_path, index_col=None) - - if data is None: - raise ValueError("No data loaded") - - else: - logger.info("Using data provided") - data = json.loads(str(data)) - data = pd.DataFrame([data]) - - logger.info("--- Loading Model ---") - - if model_type == "autogluon": - logger.info("Using an Autogluon model") - model = AutogluonModel() - else: - raise ValueError("No other model currently") - - # In lambda, only the /tmp folder is writable - model_folder = "/tmp" if RUNTIME_ENVIRONMENT in ["dev", "prod"] else "local_model" - - model.load_model(filepath=model_location, client=CLIENT, model_folder=model_folder) - - logger.info("--- Generating Predictions ---") - prediction = model.generate_predictions(data=data) - - # logger.info(pd.concat([data["id"], prediction], axis=1)) - - return pd.concat([data["id"], prediction], axis=1) - - # Save prediction some where? - # prediction.to_csv("s3?") - - # TODO: Check how we want to structure outputs - # For now, just categorise by uprn and timestamp - # Assume one uprn coming in for now - # uprn = data.index.values[0] - - # # Saving prediction local for now - # # TODO: change uprn to TARGET_ID, put in setting - # logger.info("--- Outputting prediction and metadata --- ") - # output_base = PREDICTION_LOCATION / target_column / uprn / TIMESTAMP - # output_base.mkdir(parents=True, exist_ok=True) - - # json_prediction = prediction.to_json(output_base / PREDICTION_FILE) - # prediction_metadata = { - # "model_type": model_type, - # "model_name": model_name, - # "model_location": model_location, - # "model_settings": model.model_metadata(), - # } - - # pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE) - - return json_prediction - - -if __name__ == "__main__": - args = ingest_arguments() - - # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}' - # Data path can be passed as so: python3 predictions.py --data-path - # ./model_build_data/change_data/rdsap_full/test_data.parquet - prediction( - target_column=args.target_column, - model_path=args.model_path, - data=args.data, - data_path=args.data_path, - ) diff --git a/model_data/simulation_system/regenerate_metrics.py b/model_data/simulation_system/regenerate_metrics.py deleted file mode 100644 index 025892f5..00000000 --- a/model_data/simulation_system/regenerate_metrics.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -Script to regenerate metrics for all the models in the model registry -Key task: -- Load model registry -- For each model in the registry, generate the metrics (Key questions here is what if the test data changes) -- Save the new metrics out to s3 bucket -""" - -import os -import argparse -from s3pathlib import S3Path -import pandas as pd -from tqdm import tqdm -from core.Logger import logger -from core.Metrics import Metrics, sort_by_metric -from core.DataLoader import dataloader_factory -from core.Settings import ( - OPTIMISE_METRIC, - MODEL_DIRECTORY, - REGISTRY_FILE, - BEST_MODEL_COLUMN_NAME, -) -from MLModel.Models import model_factory - -RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") - - -def ingest_arguments() -> argparse.Namespace: - """ - Helper function to take in arguments from script start - """ - - parser = argparse.ArgumentParser(description="Inputs for training script") - - parser.add_argument( - "--test-filepath", - type=str, - help="Location of Parquet dataset to load for testing", - required=True, - ) - parser.add_argument( - "--target-column", - type=str, - help="The response variable", - choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], - default="RDSAP_CHANGE", - ) - - args = parser.parse_args() - - return args - - -def regenerate_metrics(test_filepath: str, target_column: str) -> None: - """ - Recreate all metrics for all models - """ - - logger.info("--- Loading test data ---") - dataloader = dataloader_factory(runtime_environment=RUNTIME_ENVIRONMENT) - test_df = dataloader.load(filepath=test_filepath) - - logger.info("--- Loading model registry ---") - logger.info(f"Loading registry for {target_column} models") - registry_df = dataloader.load( - filepath=S3Path(MODEL_DIRECTORY, target_column, REGISTRY_FILE).uri - ) - - logger.info("Extract non-metric columns") - registry_df = registry_df[["model_type", "model_name", "model_location"]] - - logger.info("--- Regenerating metrics ---") - - metric_suite = Metrics() - - metrics_df = pd.DataFrame(columns=metric_suite.list_metric_functions()) - - for _, row in tqdm(registry_df.iterrows()): - - logger.info(f"--- Loading Model ({row['model_name']}) ---") - - model = model_factory(model_type=row["model_type"])() - - model.load_model(filepath=row["model_location"]) - - metrics = metric_suite.generate_metric_suite( - model=model, data=test_df, target_column=target_column - ) - - # Add metrics row by row - metrics_df = pd.concat([metrics_df, metrics], axis=0).reset_index(drop=True) - - # Add metrics df to registry df side by side - registry_df = pd.concat([registry_df, metrics_df], axis=1) - - logger.info(f"--- Sorting by Optimise Metric ({OPTIMISE_METRIC}) ---") - - registry_df = sort_by_metric( - data=registry_df, - optimse_metric=OPTIMISE_METRIC, - best_model_column_name=BEST_MODEL_COLUMN_NAME, - ) - - logger.info("--- Saving model metrics ---") - - registry_df.to_csv(S3Path(MODEL_DIRECTORY, target_column, REGISTRY_FILE).uri) - - -if __name__ == "__main__": - - logger.info("---Begin Pipeline to regenerate metrics---") - - logger.info("---Ingest Arguments---") - args = ingest_arguments() - - regenerate_metrics( - test_filepath=args.test_filepath, target_column=args.target_column - ) diff --git a/model_data/simulation_system/requirements.txt b/model_data/simulation_system/requirements.txt deleted file mode 100644 index 62db1719..00000000 --- a/model_data/simulation_system/requirements.txt +++ /dev/null @@ -1,208 +0,0 @@ -absl-py==1.4.0 -accelerate==0.16.0 -aiohttp==3.8.5 -aiohttp-cors==0.7.0 -aiosignal==1.3.1 -aliyun-python-sdk-core==2.13.36 -aliyun-python-sdk-kms==2.16.1 -antlr4-python3-runtime==4.9.3 -asttokens==2.2.1 -async-timeout==4.0.3 -attrs==23.1.0 -autogluon==0.8.2 -autogluon.common==0.8.2 -autogluon.core==0.8.2 -autogluon.features==0.8.2 -autogluon.multimodal==0.8.2 -autogluon.tabular==0.8.2 -autogluon.timeseries==0.8.2 -backcall==0.2.0 -beautifulsoup4==4.12.2 -blessed==1.20.0 -blis==0.7.10 -boto3==1.28.25 -botocore==1.31.25 -cachetools==5.3.1 -catalogue==2.0.9 -catboost==1.2 -certifi==2023.7.22 -cffi==1.15.1 -charset-normalizer==3.2.0 -click==8.1.6 -cloudpickle==2.2.1 -colorama==0.4.6 -colorful==0.5.5 -comm==0.1.4 -confection==0.1.1 -contourpy==1.1.0 -crcmod==1.7 -cryptography==41.0.3 -cycler==0.11.0 -cymem==2.0.7 -datasets==2.14.4 -debugpy==1.6.7 -decorator==5.1.1 -defusedxml==0.7.1 -dill==0.3.7 -distlib==0.3.7 -evaluate==0.3.0 -executing==1.2.0 -fastai==2.7.12 -fastcore==1.5.29 -fastdownload==0.0.7 -fastprogress==1.0.3 -filelock==3.12.2 -fonttools==4.42.0 -frozenlist==1.4.0 -fsspec==2023.6.0 -future==0.18.3 -gdown==4.7.1 -gluonts==0.13.3 -google-api-core==2.11.1 -google-auth==2.22.0 -google-auth-oauthlib==1.0.0 -googleapis-common-protos==1.60.0 -gpustat==1.1 -graphviz==0.20.1 -grpcio==1.50.0 -huggingface-hub==0.16.4 -hyperopt==0.2.7 -idna==3.4 -imageio==2.31.1 -ipykernel==6.25.1 -ipython==8.14.0 -jedi==0.19.0 -Jinja2==3.1.2 -jmespath==0.10.0 -joblib==1.3.2 -jsonschema==4.17.3 -jupyter_client==8.3.0 -jupyter_core==5.3.1 -kiwisolver==1.4.4 -langcodes==3.3.0 -lightgbm==3.3.5 -lightning-utilities==0.9.0 -llvmlite==0.40.1 -Markdown==3.4.4 -markdown-it-py==3.0.0 -MarkupSafe==2.1.3 -matplotlib==3.7.2 -matplotlib-inline==0.1.6 -mdurl==0.1.2 -mlforecast==0.7.3 -model-index==0.1.11 -msgpack==1.0.5 -multidict==6.0.4 -multiprocess==0.70.15 -murmurhash==1.0.9 -nest-asyncio==1.5.7 -networkx==3.1 -nlpaug==1.1.11 -nltk==3.8.1 -nptyping==2.4.1 -numba==0.57.1 -numpy==1.24.4 -nvidia-ml-py==12.535.77 -oauthlib==3.2.2 -omegaconf==2.2.3 -opencensus==0.11.2 -opencensus-context==0.1.3 -opendatalab==0.0.10 -openmim==0.3.9 -openxlab==0.0.17 -ordered-set==4.1.0 -oss2==2.17.0 -packaging==23.1 -pandas==1.5.3 -parso==0.8.3 -pathy==0.10.2 -patsy==0.5.3 -pexpect==4.8.0 -pickleshare==0.7.5 -Pillow==9.5.0 -platformdirs==3.10.0 -plotly==5.16.0 -preshed==3.0.8 -prometheus-client==0.17.1 -prompt-toolkit==3.0.39 -protobuf==3.20.2 -psutil==5.9.5 -ptyprocess==0.7.0 -pure-eval==0.2.2 -py-spy==0.3.14 -py4j==0.10.9.7 -pyarrow==12.0.1 -pyasn1==0.5.0 -pyasn1-modules==0.3.0 -pycparser==2.21 -pycryptodome==3.18.0 -pydantic==1.10.12 -Pygments==2.16.1 -pyparsing==3.0.9 -pyrsistent==0.19.3 -PySocks==1.7.1 -pytesseract==0.3.10 -python-dateutil==2.8.2 -pytorch-lightning==1.9.5 -pytorch-metric-learning==1.7.3 -pytz==2023.3 -PyWavelets==1.4.1 -PyYAML==6.0.1 -pyzmq==25.1.0 -ray==2.3.1 -regex==2023.8.8 -requests==2.28.2 -requests-oauthlib==1.3.1 -responses==0.18.0 -rich==13.4.2 -rsa==4.9 -s3transfer==0.6.1 -safetensors==0.3.2 -scikit-image==0.19.3 -scikit-learn==1.2.2 -scipy==1.11.1 -seaborn==0.12.2 -sentencepiece==0.1.99 -seqeval==1.2.2 -six==1.16.0 -smart-open==6.3.0 -soupsieve==2.4.1 -spacy==3.6.1 -spacy-legacy==3.0.12 -spacy-loggers==1.0.4 -srsly==2.4.7 -stack-data==0.6.2 -statsforecast==1.4.0 -statsmodels==0.14.0 -tabulate==0.9.0 -tenacity==8.2.2 -tensorboard==2.14.0 -tensorboard-data-server==0.7.1 -tensorboardX==2.6.2 -text-unidecode==1.3 -thinc==8.1.12 -threadpoolctl==3.2.0 -tifffile==2023.7.18 -timm==0.9.5 -tokenizers==0.13.3 -toolz==0.12.0 -torch==1.13.1 -torchmetrics==0.11.4 -torchvision==0.14.1 -tornado==6.3.2 -tqdm==4.65.1 -traitlets==5.9.0 -transformers==4.26.1 -typer==0.9.0 -typing_extensions==4.7.1 -tzdata==2023.3 -ujson==5.8.0 -urllib3==1.26.16 -virtualenv==20.24.3 -wasabi==1.1.2 -wcwidth==0.2.6 -Werkzeug==2.3.6 -window-ops==0.0.14 -xgboost==1.7.6 -xxhash==3.3.0 -yarl==1.9.2 diff --git a/model_data/simulation_system/test_data_generation.py b/model_data/simulation_system/test_data_generation.py deleted file mode 100644 index 8989d491..00000000 --- a/model_data/simulation_system/test_data_generation.py +++ /dev/null @@ -1,109 +0,0 @@ -from model_data.simulation_system.core.Logger import logger -import argparse -import pandas as pd -from pathlib import Path -from model_data.simulation_system.core.Settings import RANDOM_SEED, TRAIN_AND_VALIDATION_DATA_NAME, TEST_DATA_NAME - - -def ingest_arguments() -> argparse.Namespace: - """ - Helper function to take in arguments from script start - """ - - parser = argparse.ArgumentParser(description="Inputs for training script") - - parser.add_argument( - "--filepath", - type=str, - help="Location of Parquet dataset to load", - required=True, - ) - parser.add_argument( - "--output-folder", - type=str, - help="Location of Parquet dataset to save", - required=True, - ) - parser.add_argument( - "--percentage", - type=float, - help="Percentage of data to use as test data", - default=None, - ) - parser.add_argument( - "--volume", type=int, help="Volume of data to use as test data", default=None - ) - parser.add_argument( - "--sampling", - type=str, - help="Type of sampling to do for test data", - choices=["random", "stratified"], - default="random", - ) - - args = parser.parse_args() - - return args - - -def main( - filepath: str, output_folder: str, percentage: float, volume: int, sampling: str -): - """ - Load a dataset in and split out the training+validation data and the test data. - """ - - logger.info("---Loading Data---") - data = pd.read_parquet(filepath).reset_index(drop=True) - - if percentage and volume is None: - test_amount = round(len(data) * percentage) - elif percentage is None and volume: - test_amount = volume - elif percentage is None and volume is None: - logger.error( - "No amount specified - please specify either a percentage or volume" - ) - exit(1) - else: - logger.info("Both percentage and volume specified - taking largest of the two") - test_amount = max(round(len(data) * percentage), volume) - - logger.info(f"---Extracting {test_amount} from dataset to be test data") - - train_validation_data = pd.DataFrame() - test_data = pd.DataFrame() - - if sampling == "random": - logger.info("--- Using random sample method ---") - sample_index = data.sample(n=test_amount, random_state=RANDOM_SEED).index - - train_validation_data = data.drop(sample_index) - test_data = data.iloc[sample_index] - - elif sampling == "stratified": - # Not yet implemented - pass - - logger.info("--- Saving data ---") - - train_validation_data.to_parquet( - Path(output_folder) / TRAIN_AND_VALIDATION_DATA_NAME - ) - test_data.to_parquet(Path(output_folder) / TEST_DATA_NAME) - - logger.info(" ---Pipeline complete---") - - -if __name__ == "__main__": - logger.info("--- Generate test data pipeline ---") - - args = ingest_arguments() - - main( - filepath=args.filepath, - output_folder=args.output_folder, - percentage=args.percentage, - volume=args.volume, - sampling=args.sampling, - ) diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py deleted file mode 100644 index b68d1c4d..00000000 --- a/model_data/simulation_system/training.py +++ /dev/null @@ -1,259 +0,0 @@ -import argparse - -import os -import shutil -from pathlib import Path -from datetime import datetime -import pandas as pd -from MLModel.Models import model_factory -from core.Logger import logger -from core.Metrics import Metrics, sort_by_metric -from core.DataLoader import dataloader_factory -from core.FeatureProcessor import FeatureProcessor -from core.CloudClient import BotoClient -from core.RegistryHandler import RegistryHandler -from core.Settings import ( - MODEL_DIRECTORY, - REGISTRY_FILE, - MODEL_FOLDER, - METRICS_FOLDER, - DEPLOYMENT_FOLDER, - SUBSAMPLE_FACTOR, - MODEL_HYPERPARAMETERS, - TIMESTAMP_FORMAT, - RESIDUAL_FILE, - BEST_MODEL_COLUMN_NAME, - OPTIMISE_METRIC, -) - -TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT) - -RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") - -CLIENT = BotoClient(runtime_environment=RUNTIME_ENVIRONMENT) -# CLIENT = S3FSClient(runtime_environment=RUNTIME_ENVIRONMENT) - - -# FOR TESTING -# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet" -# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet" -# target_column = "RDSAP_CHANGE" -# model_type = "autogluon" -# hyperparameter = HYPERPARAMETERS -# SUBSAMPLE_FACTOR = 200 - -# Mock location -# train_filepath = "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet" -# test_filepath = "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data.parquet" - - -# To run script in local mode: -# python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet - -# To run script in local-mock mode: -# python3 training.py --train-filepath s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data.parquet - - -def ingest_arguments() -> argparse.Namespace: - """ - Helper function to take in arguments from script start - """ - - parser = argparse.ArgumentParser(description="Inputs for training script") - - parser.add_argument( - "--train-filepath", - type=str, - help="Location of Parquet dataset to load for training", - required=True, - ) - parser.add_argument( - "--test-filepath", - type=str, - help="Location of Parquet dataset to load for testing", - required=True, - ) - parser.add_argument( - "--model-type", - type=str, - help="The type of model to train", - choices=["autogluon"], - default="autogluon", - ) - parser.add_argument( - "--target-column", - type=str, - help="The response variable", - choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], - default="RDSAP_CHANGE", - ) - - args = parser.parse_args() - - return args - - -def training( - train_filepath: str, - test_filepath: str, - target_column: str = "RDSAP_CHANGE", - model_type: str = "autogluon", - hyperparameters: dict | None = None, -) -> None: - """ - Pipeline to run training on the dataset - """ - - logger.info("--- Loading data ---") - dataloader = dataloader_factory(runtime_environment=RUNTIME_ENVIRONMENT) - train_df = dataloader.load(client=CLIENT, filepath=train_filepath) - test_df = dataloader.load(client=CLIENT, filepath=test_filepath) - - if train_df is None or test_df is None: - raise ValueError("No data Loaded - cancelling pipeline") - - logger.info("--- Feature processing ---") - - feature_processor = FeatureProcessor() - - # This is for convenience for now - subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR) - - train_df = feature_processor.process( - train_df, target_column=target_column, subsample_amount=subsample_amount - ) - test_df = feature_processor.process(test_df, target_column=target_column) - - logger.info("--- Build Model ---") - - logger.info("--- Load Hyperparameters ---") - - if hyperparameters is None: - logger.info("Use base hyperparameters in settings") - hyperparameters = MODEL_HYPERPARAMETERS[model_type] - logger.info(f"Hyperparameters are: {hyperparameters}") - - logger.info( - "--- Loading model configuration (Model type and Naming convention) ---" - ) - # We might want to have hyperparameters in the names to make models more recognisable - model_toolkit = model_factory( - model_type=model_type, hyperparameters=hyperparameters - ) - - model_root = ( - f"{target_column}-{model_toolkit['naming_attributes']}-{TIMESTAMP}".lower() - ) - output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root - - # Will need to pass output path to model (for saving purposes) - model = model_toolkit["model"](output_filepath=output_base / MODEL_FOLDER) - - model.train_model( - data=train_df, target_column=target_column, hyperparameters=hyperparameters - ) - - logger.info("--- Save Model ---") - model.save_model(output_filepath=model.output_filepath, client=CLIENT) - - logger.info("--- Generate evaluation metrics ---") - metrics = Metrics() - - metric_output_path = output_base / METRICS_FOLDER - metrics_df = model.model_evaluation( - validation_data=test_df, - target_column=target_column, - metrics_location=metric_output_path, - metrics=metrics, - ) - - logger.info("--- Generate metric outputs using predictions ---") - # metrics.generate_plot_suite() - - plot_output_path = output_base / METRICS_FOLDER / RESIDUAL_FILE - metrics.generate_residual_plot( - actuals=test_df[target_column], - predictions=model.predictions, - target_column=target_column, - output_filepath=plot_output_path, - ) - - metrics.upload_metrics(output_filepath=metric_output_path, client=CLIENT) - - # TODO: for cml, we might want to have class that outputs all data and plots to add to the report - # If we want residual plot/ any plots, we will need to self host - - # TODO: introduce a seperate script for model optimisation, and from there, optimise for deployment - # Imagining for now that the model trained here is the best model amongst all models built - - logger.info("--- Optimising model for deployment ---") - - deployment_model_path = output_base / DEPLOYMENT_FOLDER - - model.optimise_model_for_deployment(deployment_path=deployment_model_path) - logger.info( - f"Optimised version of best model can be found at: {deployment_model_path}" - ) - - model.save_model(output_filepath=deployment_model_path, client=CLIENT) - - # TODO: Need a model registry - for now have this as a CSV - # Save this in the model directory - # Loading registry from s3 - logger.info("--- Append registry with new model ---") - - registry_handler = RegistryHandler() - - registry_path = Path(MODEL_DIRECTORY) / target_column / REGISTRY_FILE - - registry_df = registry_handler.load_registry( - registry_path=registry_path, client=CLIENT, metrics=metrics - ) - - model_details_df = pd.DataFrame( - [ - { - "model_type": model_type, - "model_name": model_root, - "model_location": deployment_model_path, - } - ] - ) - - registry_row = pd.concat([model_details_df, metrics_df], axis=1) - registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True) - - registry_df = sort_by_metric( - registry_df, - optimse_metric=OPTIMISE_METRIC, - best_model_column_name=BEST_MODEL_COLUMN_NAME, - ) - - logger.info("--- Saving new model to registry ---") - # Ensure the directory exists - registry_path.parent.mkdir(parents=True, exist_ok=True) - registry_df.to_csv(registry_path, index=False) - - registry_handler.save_registry(output_filepath=registry_path, client=CLIENT) - - logger.info("--- Clean up ---") - if RUNTIME_ENVIRONMENT != "local" and Path(MODEL_DIRECTORY).exists(): - logger.info("Removing local development files not in s3") - shutil.rmtree(Path(MODEL_DIRECTORY)) - - logger.info("--- Training Pipeline Complete --- ") - - -if __name__ == "__main__": - - logger.info("---Begin Pipeline---") - - logger.info("---Ingest Arguments---") - args = ingest_arguments() - - training( - train_filepath=args.train_filepath, - test_filepath=args.test_filepath, - target_column=args.target_column, - model_type=args.model_type, - ) diff --git a/model_data/temp_inputs.py b/model_data/temp_inputs.py deleted file mode 100644 index 18248734..00000000 --- a/model_data/temp_inputs.py +++ /dev/null @@ -1,35 +0,0 @@ -# Temporary input data -input_data = [ - { - "address1": "28 Distillery Wharf", - "postcode": "w6 9bf" - }, - { - "address1": "Flat 14 Godley V C House", - "postcode": "E2 0LP" - }, - { - "address1": "49 Elderfield Road", - "postcode": "E5 0LF" - }, - { - "address1": "26 Stanhope Road", - "postcode": "N6 5NG" - }, - { - "address1": "Flat 3 Frederick Building", - "postcode": "N1 4BD" - }, - { - "address1": "Flat 4 Frederick Building", - "postcode": "N1 4BD" - }, - { - "address1": "Flat 28, 22 Adelina Grove", - "postcode": "E1 3BX" - }, - { - "address1": "Flat 39, 239 Long Lane", - "postcode": "SE1 4PT" - } -] diff --git a/model_data/tests/test_sap_model.py b/model_data/tests/test_sap_model.py deleted file mode 100644 index 71d6369c..00000000 --- a/model_data/tests/test_sap_model.py +++ /dev/null @@ -1,37 +0,0 @@ -import pytest -from model_data.EpcClean import EpcClean -from model_data.analysis.SapModel import SapModel -from model_data.tests.test_data.sap_model_data import data - - -class TestSapModel: - - @pytest.fixture - def cleaner(self): - cleaner = EpcClean(data=data) - cleaner.clean() - return cleaner - - @pytest.fixture - def model(self, cleaner): - model = SapModel(data, cleaner=cleaner) - return model - - def test_run(self, model): - assert model.final_model is None - assert model.test_model is None - model.run() - - assert model.final_model is not None - assert model.test_model is not None - - # Note - this will potentially be different on different machines so may need to adjust these tests accordingly - # when running in CI/CD - assert (model.fit_error['Median Absolute Error'] - 1.7316860436422203) < 0.00001 - assert (model.predict_error['Median Absolute Error'] - 2.85481857667385) < 0.00001 - # final model doesn't do that well on this test data - assert (model.final_error['Median Absolute Error'] - 10.050349496213855) < 0.00001 - - - - diff --git a/recommendations/WallRecommendations.py b/recommendations/WallRecommendations.py index c46a495f..d99f794f 100644 --- a/recommendations/WallRecommendations.py +++ b/recommendations/WallRecommendations.py @@ -91,8 +91,8 @@ class WallRecommendations(Definitions): if self.property.walls["thermal_transmittance_unit"] != self.U_VALUE_UNIT: raise NotImplementedError("Haven't handled the case of other u value units yet") - # TODO: It's worth thinking about this logic because depending on when properties were built, - # they're likely to be of a certain standard. E.g. properties built within a certain time + # TODO: It's worth thinking about this logic because depending on when property_change were built, + # they're likely to be of a certain standard. E.g. property_change built within a certain time # period are likely to have cavity walls # We can't detect it's a cavity wall, but it was built after 1990 so likely built with insulation already diff --git a/recommendations/tests/test_wall_recommendations.py b/recommendations/tests/test_wall_recommendations.py index cf036a1c..f58ec02e 100644 --- a/recommendations/tests/test_wall_recommendations.py +++ b/recommendations/tests/test_wall_recommendations.py @@ -230,7 +230,7 @@ class TestWallRecommendations: The important data for this recommendation is: - u value of 0.16 - property built in 2014 - Since properties built after 1990 are typically built with insulation and this property + Since property_change built after 1990 are typically built with insulation and this property already has really good insulation, we do NOT recommend any measures for this property """ input_properties[0].year_built = 2014 diff --git a/sapmodel.serverless.yml b/sapmodel.serverless.yml deleted file mode 100644 index 8e811a57..00000000 --- a/sapmodel.serverless.yml +++ /dev/null @@ -1,62 +0,0 @@ -service: sapmodel - -provider: - name: aws - region: eu-west-2 - architecture: x86_64 - environment: - RUNTIME_ENVIRONMENT: ${env:RUNTIME_ENVIRONMENT} - MODEL_DIRECTORY_BUCKET: ${env:MODEL_DIRECTORY_BUCKET} - PREDICTIONS_BUCKET: ${env:PREDICTIONS_BUCKET} - DATA_BUCKET: ${env:DATA_BUCKET} - DOMAIN_NAME: ${env:DOMAIN_NAME} - ECR_URI: ${env:ECR_URI} - GITHUB_SHA: ${env:GITHUB_SHA} - iam: - role: - name: sapmodel_s3_access - statements: - # Allow reading from MODEL_DIRECTORY_BUCKET and DATA_BUCKET - - Effect: Allow - Action: - - s3:* - # - s3:GetObject - # - s3:ListBucket - Resource: - - arn:aws:s3:::${env:MODEL_DIRECTORY_BUCKET} - - arn:aws:s3:::${env:MODEL_DIRECTORY_BUCKET}/* - - arn:aws:s3:::${env:DATA_BUCKET} - - arn:aws:s3:::${env:DATA_BUCKET}/* - # Allow reading and writing to PREDICTIONS_BUCKET - - Effect: Allow - Action: - - s3:* - # - s3:GetObject - # - s3:PutObject - # - s3:ListBucket - Resource: - - arn:aws:s3:::${env:PREDICTIONS_BUCKET} - - arn:aws:s3:::${env:PREDICTIONS_BUCKET}/* - - - -plugins: - - serverless-domain-manager - -custom: - customDomain: - domainName: api.${self:provider.environment.DOMAIN_NAME} - basePath: 'sapmodel' - createRoute53Record: true - certificateArn: ${ssm:/ssl_certificate_arn} - -functions: - sap_prediction_lambda: - image: - uri: ${env:ECR_URI}:${env:GITHUB_SHA} - # role: sapPredictionLambdaRole - events: - - http: - path: /predict - method: POST - timeout: 120 # Set max run time to 2 minutes - we shouldn't need this much time so this can be reviewed