diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py
index bb6f80bf..e1ab4c97 100644
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@@ -3,7 +3,7 @@ import pandas as pd
 from tqdm import tqdm
 from model_data.BaseUtility import BaseUtility
 from pathlib import Path
-from simulation_system.Settings import (
+from model_data.simulation_system.Settings import (
     MANDATORY_FIXED_FEATURES,
     AVERAGE_FIXED_FEATURES,
     LATEST_FIELD,
@@ -28,7 +28,6 @@ def app():
 
     dataset = []
 
-
     for directory in tqdm(directories):
 
         filepath = directory / "certificates.csv"
diff --git a/model_data/simulation_system/energy_predictor.py b/model_data/simulation_system/energy_predictor.py
new file mode 100644
index 00000000..8e2a7e25
--- /dev/null
+++ b/model_data/simulation_system/energy_predictor.py
@@ -0,0 +1,70 @@
+from pathlib import Path
+from model_data.simulation_system.Settings import (
+    RDSAP_RESPONSE,
+    FLOOR_LEVEL_MAP,
+    BUILT_FORM_REMAP,
+    EARLIEST_EPC_DATE,
+    FULLY_GLAZED_DESCRIPTIONS
+    )
+from model_data.BaseUtility import BaseUtility
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+
+DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
+
+
+def main():
+    """
+    Build the cleaned certificate dataset for a simple RDSAP predictor.
+
+    Reads ``certificates.csv`` from up to 10 sub-directories of
+    DATA_DIRECTORY, filters and normalises the rows, and writes the
+    combined frame to ``./energy_predictor_data.parquet``.
+    """
+
+    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
+    # NOTE: only the first 10 directories (in iterdir() order) are processed
+    directories = directories[0:10]
+
+    # Map all anomaly values to None; loop-invariant, so build it once
+    data_anomaly_map = dict.fromkeys(BaseUtility.DATA_ANOMALY_MATCHES)
+
+    dfs = []
+    for directory in tqdm(directories):
+        filepath = directory / "certificates.csv"
+        df = pd.read_csv(filepath)
+
+        # Remove any bad uprns and ignore old/bad data
+        df = df[~pd.isnull(df["UPRN"])]
+        df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
+        df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
+        df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
+
+        # A missing proportion with a fully glazed windows description is set to 100
+        no_multi_glaze_proportion_index = pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
+        df.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100
+
+        # Recast
+        df["UPRN"] = df["UPRN"].astype(int).astype(str)
+        df['MAIN_HEATING_CONTROLS'] = df['MAIN_HEATING_CONTROLS'].astype(float)
+
+        # Sort Data
+        df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
+
+        # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
+        df = df.replace(data_anomaly_map)
+        df = df.replace(np.nan, None)
+
+        # Remap certain columns (in place, so the original column names are kept)
+        df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
+        df['BUILT_FORM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP)
+
+        dfs.append(df)
+
+    data = pd.concat(dfs)
+    data.to_parquet('./energy_predictor_data.parquet')
+
+
+if __name__ == "__main__":
+    main()