diff --git a/model_data/simulation_system/DataProcessor.py b/model_data/simulation_system/DataProcessor.py new file mode 100644 index 00000000..4b2202e8 --- /dev/null +++ b/model_data/simulation_system/DataProcessor.py @@ -0,0 +1,142 @@ +from pathlib import Path +import pandas as pd +from simulation_system.Settings import ( + DATA_PROCESSOR_SETTINGS, + EARLIEST_EPC_DATE, + FULLY_GLAZED_DESCRIPTIONS, + AVERAGE_FIXED_FEATURES, + FLOOR_HEIGHT_NATIONAL_AVERAGE, + TOTAL_FLOOR_AREA_NATIONAL_AVERAGE + ) + + +class DataProcessor: + """ + Handle data loading and data preprocessing + """ + + def __init__(self, filepath: Path) -> None: + self.filepath = filepath + + def load_data(self, low_memory=False) -> None: + self.data = pd.read_csv(self.filepath, low_memory=low_memory) + + def pre_process(self) -> pd.DataFrame: + """ + Load data and begin initial cleaning + """ + self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory']) + self.confine_data() + self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings']) + self.clean_multi_glaze_proportion() + self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count']) + + self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) + + return self.data + + def make_cleaning_averages(self) -> pd.DataFrame: + # Define a custom function to calculate the median, excluding missing values + def median_without_missing(group): + return group[AVERAGE_FIXED_FEATURES].median(skipna=True) + + cleaning_averages = self.data.groupby( + ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], + observed=True + ).apply(median_without_missing).reset_index() + + general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply( + median_without_missing).reset_index() + + property_averages = self.data.groupby(["PROPERTY_TYPE"], observed=True).apply( + median_without_missing).reset_index() + + built_form_averages = self.data.groupby(["BUILT_FORM"], observed=True).apply( + median_without_missing).reset_index() + + # We can clean up any NA's in the cleaning averages with the general averages here + cleaning_averages_filled = pd.merge(cleaning_averages, general_averages, on=['PROPERTY_TYPE', 'BUILT_FORM'], suffixes=['', '_AVERAGE']) + cleaning_averages_filled = pd.merge(cleaning_averages_filled, property_averages, on=['PROPERTY_TYPE'], suffixes=['', '_PROPERTY_AVERAGE']) + cleaning_averages_filled = pd.merge(cleaning_averages_filled, built_form_averages, on=['BUILT_FORM'], suffixes=['', '_BUILT_FORM_AVERAGE']) + + # Replace any missing NAN values with averages for the same Property type and built form + cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_AVERAGE']) + cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_AVERAGE']) + cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE']) + + # If there are still NA values i.e. the averages do not have values for a speicifc group of property tyope and built form + # We can use just the property type average and replace + cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE']) + cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_PROPERTY_AVERAGE']) + cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE', 'FLOOR_HEIGHT_PROPERTY_AVERAGE']) + + # If there are still NA values, use BUILT FORM averages + cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE']) + cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_BUILT_FORM_AVERAGE']) + cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE', 'FLOOR_HEIGHT_BUILT_FORM_AVERAGE']) + + # If there still is na values, use average across all properties in consituecy + cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA'].mean()) + cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT'].mean()) + + # If the consituency is all NA values, then take UK AVERAGE VALUES + cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(TOTAL_FLOOR_AREA_NATIONAL_AVERAGE) + cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(FLOOR_HEIGHT_NATIONAL_AVERAGE) + + return cleaning_averages_filled + + def retain_multiple_epc_properties(self, epc_minimum_count: int = 1) -> None: + ''' + Reduce the data futher by keeping only datasets with multiple epcs + ''' + + counts = self.data.groupby("UPRN").size().reset_index() + counts.columns = ["UPRN", "count"] + + # take UPRNS with multiple EPCs + counts = counts[counts["count"] > epc_minimum_count] + self.data = pd.merge(self.data, counts, on='UPRN') + + + def recast_df_columns(self, column_mappings: dict) -> None: + """ + Recast columns from the dataframe to ensure the behaviour we want + """ + + for key, values in column_mappings.items(): + if key not in self.data.columns: + print('Column mapping incorrectly specified') + exit(1) + for value in values: + self.data[key] = self.data[key].astype(value) + + + def confine_data(self) -> None: + """ + Include all step to reduce down the data based on assumptions + """ + + # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one + + # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged + # before the introduction of SAP09 + + # Filter 3: We remove EPCS that were conducted for a new build, since these are performed with + # full SAP, which produces different results to the RdSAP methodology + + # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous + + self.data = self.data[~pd.isnull(self.data["UPRN"])] + self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] + self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"] + self.data = self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])] + + + def clean_multi_glaze_proportion(self) -> None: + """ + If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100 + """ + + no_multi_glaze_proportion_index = pd.isnull(self.data["MULTI_GLAZE_PROPORTION"]) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)) + self.data.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100 + diff --git a/model_data/simulation_system/Logger.py b/model_data/simulation_system/Logger.py new file mode 100644 index 00000000..5197e7ce --- /dev/null +++ b/model_data/simulation_system/Logger.py @@ -0,0 +1,22 @@ +import logging + +def setup_logger(): + # Create a logger + logger = logging.getLogger() + + # Set the log level + logger.setLevel(logging.INFO) + + # Create a formatter + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + + # Create a stream handler to direct logs to stdout + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(formatter) + + # Add the stream handler to the logger + logger.addHandler(stream_handler) + + return logger + +logger = setup_logger() \ No newline at end of file diff --git a/model_data/simulation_system/Settings.py b/model_data/simulation_system/Settings.py new file mode 100644 index 00000000..04e11c25 --- /dev/null +++ b/model_data/simulation_system/Settings.py @@ -0,0 +1,114 @@ +# Using a simply python file as settings for now +# TODO: migrate to dynaconf + +TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70 +FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45 + +FULLY_GLAZED_DESCRIPTIONS = [ + "Fully double glazed", + "High performance glazing", + "Fully triple glazed", + "Full secondary glazing", + "Multiple glazing throughout", +] + +FIXED_FEATURES = [ + 'PROPERTY_TYPE', + 'BUILT_FORM', + 'CONSTRUCTION_AGE_BAND', + 'NUMBER_HABITABLE_ROOMS', + 'CONSTITUENCY', + 'NUMBER_HEATED_ROOMS', + 'FIXED_LIGHTING_OUTLETS_COUNT', + 'FLOOR_HEIGHT', + 'FLOOR_LEVEL', + 'TOTAL_FLOOR_AREA', +] + +COMPONENT_FEATURES = [ + 'TRANSACTION_TYPE', + 'WALLS_DESCRIPTION', + 'FLOOR_DESCRIPTION', + 'LIGHTING_DESCRIPTION', + 'ROOF_DESCRIPTION', + 'MAINHEAT_DESCRIPTION', + 'HOTWATER_DESCRIPTION', + 'MAIN_FUEL', + 'MECHANICAL_VENTILATION', + 'SECONDHEAT_DESCRIPTION', + 'ENERGY_TARIFF', # Not sure if this is relevant + 'SOLAR_WATER_HEATING_FLAG', + 'PHOTO_SUPPLY', + 'WINDOWS_DESCRIPTION', + 'GLAZED_TYPE', + 'MULTI_GLAZE_PROPORTION', + 'LIGHTING_DESCRIPTION', + 'LOW_ENERGY_LIGHTING', + 'NUMBER_OPEN_FIREPLACES', + 'MAINHEATCONT_DESCRIPTION', + 'EXTENSION_COUNT', + # 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION +] + +# For these fields, we take an average if we have multiple values +AVERAGE_FIXED_FEATURES = [ + "TOTAL_FLOOR_AREA", + "FLOOR_HEIGHT" +] + +# For these fields, we take the latest value if we have multiple values +# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is +# the most accurate +LATEST_FIELD = [ + "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS", + "FIXED_LIGHTING_OUTLETS_COUNT", + "FLOOR_LEVEL", + "CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for +] + +# If we see thee features changing, we don't use the EPC, since deem it not to be reliable +MANDATORY_FIXED_FEATURES = [ + "PROPERTY_TYPE", + "BUILT_FORM", + "CONSTITUENCY" +] + +# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were +# conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England +# and Wales from 31 July 2014 +EARLIEST_EPC_DATE = "2014-08-01" + +RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY" +HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT" + +def ordinal(n): + if 10 <= n % 100 <= 20: + suffix = 'th' + else: + suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th') + + return str(n) + suffix + +FLOOR_LEVEL_MAP = { + "Basement": -1, + "Ground": 0, + "ground floor": 0, + "20+": 20, + "21st or above": 21, + **{str(i).zfill(2): i for i in range(0, 21)}, + **{ordinal(i): i for i in range(-1, 21)}, + **{str(i): i for i in range(-1, 21)}, + **{i: i for i in range(-1, 21)}, +} + +BUILT_FORM_REMAP = { + "Enclosed End-Terrace": "End-Terrace", + "Enclosed Mid-Terrace": "Mid-Terrace", +} + +DATA_PROCESSOR_SETTINGS = { + 'low_memory': False, + 'epc_minimum_count': 1, + 'column_mappings': {'UPRN': [int, str]} +} \ No newline at end of file diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py index 5e982211..e1ab4c97 100644 --- a/model_data/simulation_system/app.py +++ b/model_data/simulation_system/app.py @@ -1,171 +1,21 @@ import numpy as np -import os import pandas as pd from tqdm import tqdm from model_data.BaseUtility import BaseUtility +from pathlib import Path +from model_data.simulation_system.Settings import ( + MANDATORY_FIXED_FEATURES, + AVERAGE_FIXED_FEATURES, + LATEST_FIELD, + COMPONENT_FEATURES, + RDSAP_RESPONSE, + HEAT_DEMAND_RESPONSE, + FLOOR_LEVEL_MAP, + BUILT_FORM_REMAP +) +from DataProcessor import DataProcessor - -def list_subdirectories(directory_path): - return [d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))] - - -DATA_DIRECTORY = os.getcwd() + '/model_data/simulation_system/data/all-domestic-certificates' - -FIXED_FEATURES = [ - 'PROPERTY_TYPE', - 'BUILT_FORM', - 'CONSTRUCTION_AGE_BAND', - 'NUMBER_HABITABLE_ROOMS', - 'CONSTITUENCY', - 'NUMBER_HEATED_ROOMS', - 'FIXED_LIGHTING_OUTLETS_COUNT', - 'FLOOR_HEIGHT', - 'FLOOR_LEVEL', - 'TOTAL_FLOOR_AREA', -] - -COMPONENT_FEATURES = [ - 'TRANSACTION_TYPE', - 'WALLS_DESCRIPTION', - 'FLOOR_DESCRIPTION', - 'LIGHTING_DESCRIPTION', - 'ROOF_DESCRIPTION', - 'MAINHEAT_DESCRIPTION', - 'HOTWATER_DESCRIPTION', - 'MAIN_FUEL', - 'MECHANICAL_VENTILATION', - 'SECONDHEAT_DESCRIPTION', - 'ENERGY_TARIFF', # Not sure if this is relevant - 'SOLAR_WATER_HEATING_FLAG', - 'PHOTO_SUPPLY', - 'WINDOWS_DESCRIPTION', - 'GLAZED_TYPE', - 'MULTI_GLAZE_PROPORTION', - 'LIGHTING_DESCRIPTION', - 'LOW_ENERGY_LIGHTING', - 'NUMBER_OPEN_FIREPLACES', - 'MAINHEATCONT_DESCRIPTION', - 'EXTENSION_COUNT', - # 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION -] - -# For these fields, we take an average if we have multiple values -AVERAGE_FIXED_FEATURES = [ - "TOTAL_FLOOR_AREA", - "FLOOR_HEIGHT" -] - -# For these fields, we take the latest value if we have multiple values -# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is -# the most accurate -LATEST_FIELD = [ - "NUMBER_HABITABLE_ROOMS", - "NUMBER_HEATED_ROOMS", - "FIXED_LIGHTING_OUTLETS_COUNT", - "CONSTRUCTION_AGE_BAND", - "FLOOR_LEVEL", - "CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for -] - -# If we see thee features changing, we don't use the EPC, since deem it not to be reliable -MANDATORY_FIXED_FEATURES = [ - "PROPERTY_TYPE", - "BUILT_FORM", - "CONSTITUENCY" -] - -# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were -# conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England -# and Wales from 31 July 2014 -EARLIEST_EPC_DATE = "2014-08-01" - -RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY" -HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT" - - -def make_cleaning_averages(df): - # Define a custom function to calculate the median, excluding missing values - def median_without_missing(group): - return group[AVERAGE_FIXED_FEATURES].median(skipna=True) - - cleaning_averages = df.groupby( - ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], - observed=True - ).apply(median_without_missing).reset_index() - - general_averages = df.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply( - median_without_missing).reset_index() - - return cleaning_averages, general_averages - - -def iterative_filtering(cleaning_averages, property_data): - # Define the columns to filter on - columns_to_filter = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", - "NUMBER_HEATED_ROOMS"] - - # Start with the entire cleaning_averages DataFrame - filtered_data = cleaning_averages.copy() - - # Iterate through the columns and apply filters one by one - for column in columns_to_filter: - # Apply the filter using the value from property_data - new_filtered_data = filtered_data[filtered_data[column] == property_data[column].iloc[0]] - - # If the filter results in no data, return the previous result - if new_filtered_data.empty: - continue - - # If the filter is successful, update the filtered data - filtered_data = new_filtered_data - - return filtered_data - - -def clean_multi_glaze_proportion(df): - fully_glazed_descriptions = [ - "Fully double glazed", - "High performance glazing", - "Fully triple glazed", - "Full secondary glazing", - "Multiple glazing throughout", - ] - - df["MULTI_GLAZE_PROPORTION"] = np.where( - pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(fully_glazed_descriptions)), - 100, - df["MULTI_GLAZE_PROPORTION"], - ) - - return df - - -def ordinal(n): - if 10 <= n % 100 <= 20: - suffix = 'th' - else: - suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th') - - return str(n) + suffix - - -FLOOR_LEVEL_MAP = { - "Basement": -1, - "Ground": 0, - "ground floor": 0, - "20+": 20, - "21st or above": 21, - **{str(i).zfill(2): i for i in range(0, 21)}, - **{ordinal(i): i for i in range(-1, 21)}, - **{str(i): i for i in range(-1, 21)}, - **{i: i for i in range(-1, 21)}, -} - -BUILT_FORM_REMAP = { - "Enclosed End-Terrace": "End-Terrace", - "Enclosed Mid-Terrace": "Mid-Terrace", -} - +DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates' def app(): # Get all the files in the directory @@ -173,108 +23,98 @@ def app(): # Data glossary: # https://epc.opendatacommunities.org/docs/guidance#glossary - directories = list_subdirectories(DATA_DIRECTORY) + # List all subdirectories + directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] dataset = [] - for directory in tqdm(directories): - filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv") - df = pd.read_csv(filepath, low_memory=False) - # UPRN is a unique identifier for a property, so we remove any EPCs that don't have one - df = df[~pd.isnull(df["UPRN"])] - # Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged - # before the introduction of SAP09 - df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] - cleaning_averages, general_averages = make_cleaning_averages(df) + for directory in tqdm(directories): - # We remove EPCS that were conducted for a new build, since these are performed with - # full SAP, which produces different results to the RdSAP methodology - df = df[df["TRANSACTION_TYPE"] != "new dwelling"] + filepath = directory / "certificates.csv" - df = clean_multi_glaze_proportion(df) + data_processor = DataProcessor(filepath=filepath) - # We remove floor level in top floor or mid floor since this is ambiguous - df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])] - - df["UPRN"] = df["UPRN"].astype(int).astype(str) - counts = df.groupby("UPRN").size().reset_index() - counts.columns = ["UPRN", "count"] - counts = counts.sort_values("count", ascending=False) - - # take UPRNS with multiple EPCs - counts = counts[counts["count"] > 1] - df = df[df["UPRN"].isin(counts["UPRN"])] - df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) + df = data_processor.pre_process() + cleaning_averages = data_processor.make_cleaning_averages() for uprn, property_data in df.groupby("UPRN", observed=True): # Fixed features - these are property attributes that shouldn't change over time - - ignore_epc = False fixed_data = {} - for field in FIXED_FEATURES: - vals = property_data[field].dropna().unique() - # Remove invalid values - vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES] - if field == "FLOOR_LEVEL": - vals = list({FLOOR_LEVEL_MAP[v] for v in vals}) + # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row + if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1: + continue - if field == "BUILT_FORM": - vals = list({BUILT_FORM_REMAP.get(v, v) for v in vals}) + # Map all anomaly values to None + data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES))) + + # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values + modified_property_data = property_data.replace(data_anomaly_map) + modified_property_data = modified_property_data.replace(np.NAN, None) + + # Remap certain columns + modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP) + modified_property_data['BUILT_FROM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP) - if field in AVERAGE_FIXED_FEATURES: + # Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS + latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict() + mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict() - if len(vals) > 1: - # Check the values are too far apart - if abs(vals[0] - vals[1]) / vals[0] > 0.1: - # Take the more recent value since it's likely to be more accurate - vals = [vals[-1]] + # Taking just the last row, which is the percentage change from the latest to previous one only + # modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1 - if vals: - field_value = np.mean(vals) - else: - # Clean using averages + # We can replace any NA values for Average fixed features + # We have columns that we want to merge on, but some of these columns are all NA values + # So we determine which columns to merge on, and get the equivalent grouping in the averages + columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS"] + + if any(modified_property_data[columns_to_merge_on].isna()): + # If there are any NA value, back fill first (i.e most recent), then forward fill if needed + modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill') + + # Extract the columns that are not all None + na_columns = modified_property_data[columns_to_merge_on].isna().all() + columns_to_merge_on = na_columns.index[~na_columns].to_list() - avgs = iterative_filtering(cleaning_averages, property_data) - # TODO: Should probably do a mean/median? - field_value = avgs[field].iloc[0] + # Get the corresponding groupby and merge, and fill in NA values + cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean() + modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE']) + modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE']) + modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE']) + modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE']) - if pd.isnull(field_value): - # Just the use the general averages - field_value = general_averages[ - (general_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) & - (general_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0]) - ][field].iloc[0] - - elif field in LATEST_FIELD: - field_value = vals[-1] if vals else None - else: - if len(vals) > 1: - if field in MANDATORY_FIXED_FEATURES: - ignore_epc = True - else: - raise ValueError("Fixed feature {} has more than one value - fix me".format(field)) - - field_value = vals[0] if vals else None + for field in AVERAGE_FIXED_FEATURES: + vals = list(modified_property_data[field].dropna().unique()) + if len(vals) > 1: + # Check the values are too far apart + # TODO: we could have multiple values here, why only use the first two? + if abs(vals[0] - vals[1]) / vals[0] > 0.1: + # Take the more recent value since it's likely to be more accurate + vals = [vals[-1]] + if vals: + field_value = np.mean(vals) + fixed_data[field] = field_value - if ignore_epc: - continue + #Combine all fields together + fixed_data.update(mandatory_field_data) + fixed_data.update(latest_field_data) # We include the lodgement date here as we probably need to factor time into the # model, since EPC standards and rigour have changed over time - variable_data = property_data[ + variable_data = modified_property_data[ COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE] ] # Note: we look at changes between subsequent EPCS, however we could look at other permutations # e.g. first vs second, second vs third and also first vs third property_model_data = [] - for idx in range(0, property_data.shape[0] - 1): + for idx in range(0, modified_property_data.shape[0] - 1): - if idx >= property_data.shape[0] - 1: + if idx >= modified_property_data.shape[0] - 1: break starting_record = variable_data.iloc[idx] @@ -307,3 +147,10 @@ def app(): ) dataset.extend(property_model_data) + + output = pd.DataFrame(dataset) + output.to_parquet('./dataset.parquet') + + +if __name__ == "__main__": + app() \ No newline at end of file diff --git a/model_data/simulation_system/energy_predictor.py b/model_data/simulation_system/energy_predictor.py new file mode 100644 index 00000000..4a361196 --- /dev/null +++ b/model_data/simulation_system/energy_predictor.py @@ -0,0 +1,117 @@ +from pathlib import Path +from Settings import ( + RDSAP_RESPONSE, + FLOOR_LEVEL_MAP, + BUILT_FORM_REMAP, + EARLIEST_EPC_DATE, + FULLY_GLAZED_DESCRIPTIONS, + FIXED_FEATURES, + LATEST_FIELD, + COMPONENT_FEATURES + ) +from model_data.BaseUtility import BaseUtility +from tqdm import tqdm +import pandas as pd +import numpy as np + +from autogluon.tabular import TabularDataset, TabularPredictor + +RANDOM_SEED = 0 + +DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates' + +FLOAT_COLUMNS = [ + 'NUMBER_OPEN_FIREPLACES', + 'EXTENSION_COUNT', + 'TOTAL_FLOOR_AREA', + 'PHOTO_SUPPLY', + 'FIXED_LIGHTING_OUTLETS_COUNT', + 'FLOOR_HEIGHT', + 'NUMBER_HABITABLE_ROOMS', + 'LOW_ENERGY_LIGHTING', + 'MULTI_GLAZE_PROPORTION', + 'NUMBER_HEATED_ROOMS' + ] + +def create_raw_data(): + """ + Extract all information to do a simple predictor for RDSAP + """ + + directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] + # directories = directories[0:10] + dfs = [] + for directory in tqdm(directories): + filepath = directory / "certificates.csv" + df = pd.read_csv(filepath, low_memory=False) + + # Remove any bad uprns and ignore old/bad data + df = df[~pd.isnull(df["UPRN"])] + df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] + df = df[df["TRANSACTION_TYPE"] != "new dwelling"] + df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])] + + # Change multi glaze proportion + no_multi_glaze_proportion_index = pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)) + df.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100 + + # Recast + df["UPRN"] = df["UPRN"].astype(int).astype(str) + df['MAIN_HEATING_CONTROLS'] = df['MAIN_HEATING_CONTROLS'].astype(float) + + # Sort Data + df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) + + # Map all anomaly values to None + data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES))) + + # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values + df = df.replace(data_anomaly_map) + df = df.replace(np.NAN, None) + + # Remap certain columns + df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP) + df['BUILT_FROM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP) + + # Keep only possible modelling columns + df = df[[RDSAP_RESPONSE] + list(set(FIXED_FEATURES + LATEST_FIELD + COMPONENT_FEATURES))] + + # Reduce memory usage + + # df.memory_usage() + # df.dtypes + df[RDSAP_RESPONSE] = pd.to_numeric(df[RDSAP_RESPONSE], downcast='unsigned') + df[FLOAT_COLUMNS] = df[FLOAT_COLUMNS].apply(pd.to_numeric, downcast='float') + + + dfs.append(df) + + data = pd.concat(dfs) + data.to_parquet('./energy_predictor_data.parquet') + + cleaned_data = data.dropna() + # GIves you primarily flats + cleaned_data.to_parquet('./energy_predictor_cleaned_data.parquet') + + +def main(): + + data = TabularDataset(data='./model_build_data/energy_data/cleaned_data/train_validation_data.parquet') + + subsample_size = round(len(data)/100) + data = data.sample(subsample_size, random_state=RANDOM_SEED) + + predictor_RDSAP = TabularPredictor( + label=RDSAP_RESPONSE, + path="agModels-predictENERGY", + problem_type="regression", + eval_metric='mean_absolute_error' + ).fit(data, time_limit=800, presets='high_quality', excluded_model_types=['KNN', 'CAT']) + + test_data = TabularDataset('./model_build_data/energy_data/cleaned_data/test_data.parquet') + performance = predictor_RDSAP.evaluate(test_data) + predictions = predictor_RDSAP.predict(test_data) + predictor_RDSAP.feature_importance(test_data) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/model_data/simulation_system/preprocessed_data/dataset.parquet b/model_data/simulation_system/preprocessed_data/dataset.parquet new file mode 100644 index 00000000..4b6247d6 Binary files /dev/null and b/model_data/simulation_system/preprocessed_data/dataset.parquet differ diff --git a/model_data/simulation_system/test_data_generation.py b/model_data/simulation_system/test_data_generation.py new file mode 100644 index 00000000..fb7d7c64 --- /dev/null +++ b/model_data/simulation_system/test_data_generation.py @@ -0,0 +1,77 @@ +from Logger import logger +import argparse +import pandas as pd +from pathlib import Path + +RANDOM_SEED = 0 + +def ingest_arguments() -> argparse.Namespace: + """ + Helper function to take in arguments from script start + """ + + parser = argparse.ArgumentParser(description='Inputs for training script') + + parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load', required=True) + parser.add_argument('--output-folder', type=str, help='Location of Parquet dataset to save', required=True) + parser.add_argument('--percentage', type=float, help='Percentage of data to use as test data', default=None) + parser.add_argument('--volume', type=int, help='Volume of data to use as test data', default=None) + parser.add_argument('--sampling', type=str, help='Type of sampling to do for test data', choices=['random', 'stratified'], default='random') + + args = parser.parse_args() + + return args + +def main(filepath: str, output_folder: str, percentage: float, volume: int, sampling: str): + """ + Load a dataset in and split out the training+validation data and the test data. + """ + + logger.info('---Loading Data---') + data = pd.read_parquet(filepath).reset_index(drop=True) + + if percentage and volume is None: + test_amount = round(len(data)*percentage) + elif percentage is None and volume: + test_amount = volume + elif percentage is None and volume is None: + logger.error('No amount specified - please specify either a percentage or volume') + exit(1) + else: + logger.info('Both percentage and volume specified - taking largest of the two') + test_amount = max(round(len(data)*percentage), volume) + + logger.info(f'---Extracting {test_amount} from dataset to be test data') + + if sampling == 'random': + logger.info('--- Using random sample method ---') + sample_index = data.sample(n=test_amount, random_state=RANDOM_SEED).index + + train_validation_data = data.drop(sample_index) + test_data = data.iloc[sample_index] + + elif sampling =='stratified': + # Not yet implemented + pass + + logger.info('--- Saving data ---') + + train_validation_data.to_parquet(Path(output_folder)/'train_validation_data.parquet') + test_data.to_parquet(Path(output_folder)/'test_data.parquet') + + logger.info(' ---Pipeline complete---') + +if __name__ == "__main__": + + logger.info('--- Generate test data pipeline ---') + + args = ingest_arguments() + + main( + filepath=args.filepath, + output_folder=args.output_folder, + percentage=args.percentage, + volume=args.volume, + sampling=args.sampling + ) + diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py new file mode 100644 index 00000000..cde310a3 --- /dev/null +++ b/model_data/simulation_system/training.py @@ -0,0 +1,131 @@ +import os +import pandas as pd +import argparse +from typing import List +from Logger import logger +from autogluon.tabular import TabularDataset, TabularPredictor + + +DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE'] +FEATURE_COLUMNS = None +RANDOM_SEED = 0 + +# FOR TESTING +train_filepath = "./model_build_data/train_validation_data.parquet" +test_filepath = "./model_build_data/test_data.parquet" + + +def ingest_arguments() -> argparse.Namespace: + """ + Helper function to take in arguments from script start + """ + + parser = argparse.ArgumentParser(description='Inputs for training script') + + parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training') + parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing') + + args = parser.parse_args() + + return args + + +class DataLoader(): + + @staticmethod + def load(filepath: str) -> pd.DataFrame: + """ + Load different datasets + """ + if filepath.endswith('.parquet'): + df = pd.read_parquet(filepath) + elif filepath.endswith('.csv.'): + df = pd.read_csv(filepath) + else: + logger.error('Not implemented!') + exit(1) + + return df + +class FeatureProcessor: + """ + Handle all feature manipulation before modelling + """ + + @staticmethod + def drop_columns(df: pd.DataFrame, drop_columns: str = DROP_COLUMNS) -> pd.DataFrame: + df = df.drop(columns=[drop_columns]) + return df + + def retain_features(df: pd.DataFrame, features: List[str] = None): + """ + Determine which columns to keep ofr modelling + """ + if features is None: + features = df.columns + else: + if not set(features).issubset(df.columns): + logger.error('Features defined is not contained in data') + exit(1) + + df = df[features] + + return df + + def process(self, df: pd.DataFrame) -> pd.DataFrame: + df = self.drop_columns(df, drop_columns=DROP_COLUMNS) + df = self.retain_features(df, features=FEATURE_COLUMNS) + return df + + + +def training(train_filepath: str, test_filepath: str) -> None: + """ + Pipeline to run training on the dataset + """ + + logger.info('Loading data') + dataloader = DataLoader() + train_df = dataloader.load(filepath=train_filepath) + test_df = dataloader.load(filepath=test_filepath) + + # df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE']) + + logger.info('Feature processing') + feature_processor = FeatureProcessor() + train_df = feature_processor.process(train_df) + test_df = feature_processor.process(test_df) + + # logger.info('Split data into train and validation') + + logger.info('Build Model') + data = TabularDataset(data=train_df) + # data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float) + subsample_size = round(len(data)/4) + data = data.sample(subsample_size, random_state=RANDOM_SEED) + + target_column = 'RDSAP_CHANGE' + predictor_RDSAP = TabularPredictor( + label=target_column, + path="agModels-predictRDSAP", + problem_type="regression", + eval_metric='mean_absolute_error' + ).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN']) + + logger.info('Evaluate matrics') + + test_data = TabularDataset('./model_build_data/test_data.parquet') + performance = predictor_RDSAP.evaluate(test_data) + predictions = predictor_RDSAP.predict(test_data) + + test_data['predictions'] = predictions + test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions']) + +if __name__ == "__main__": + + logger.info('---Begin Pipeline---') + + logger.info('---Ingest Arguments---') + args = ingest_arguments() + + training(train_filepath=args.train_filepath, test_filepath=args.test_filepath) \ No newline at end of file