diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py index d846dcf7..c11e7ae7 100644 --- a/model_data/simulation_system/app.py +++ b/model_data/simulation_system/app.py @@ -19,7 +19,6 @@ FIXED_FEATURES = [ 'CONSTITUENCY', 'NUMBER_HEATED_ROOMS', 'FIXED_LIGHTING_OUTLETS_COUNT', - 'GLAZED_AREA', 'FLOOR_HEIGHT', 'FLOOR_LEVEL', 'TOTAL_FLOOR_AREA', @@ -46,7 +45,8 @@ COMPONENT_FEATURES = [ 'LOW_ENERGY_LIGHTING', 'NUMBER_OPEN_FIREPLACES', 'MAINHEATCONT_DESCRIPTION', - 'EXTENSION_COUNT' + 'EXTENSION_COUNT', + # 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION ] # For these fields, we take an average if we have multiple values @@ -62,12 +62,15 @@ LATEST_FIELD = [ "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", - "CONSTRUCTION_AGE_BAND" + "CONSTRUCTION_AGE_BAND", + "FLOOR_LEVEL", + "CONSTRUCTION_AGE_BAND", # This is a field we'll probably want to use verisk data for ] # If we see thee features changing, we don't use the EPC, since deem it not to be reliable MANDATORY_FIXED_FEATURES = [ "PROPERTY_TYPE", + "BUILT_FORM", ] # For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were @@ -80,11 +83,60 @@ HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT" def make_cleaning_averages(df): - cleaning_averages = df.groupby( - ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"] - )[AVERAGE_FIXED_FEATURES].mean().reset_index() + # Define a custom function to calculate the median, excluding missing values + def median_without_missing(group): + return group[AVERAGE_FIXED_FEATURES].dropna().median() - return cleaning_averages + cleaning_averages = df.groupby( + ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], + observed=True + ).apply(median_without_missing).reset_index() + + general_averages = df.groupby(["PROPERTY_TYPE", "BUILT_FORM"], 
observed=True).apply( + median_without_missing).reset_index() + + return cleaning_averages, general_averages + + +def iterative_filtering(cleaning_averages, property_data): + # Define the columns to filter on + columns_to_filter = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS"] + + # Start with the entire cleaning_averages DataFrame + filtered_data = cleaning_averages.copy() + + # Iterate through the columns and apply filters one by one + for column in columns_to_filter: + # Apply the filter using the value from property_data + new_filtered_data = filtered_data[filtered_data[column] == property_data[column].iloc[0]] + + # If the filter results in no data, keep the previous result and skip this filter + if new_filtered_data.empty: + continue + + # If the filter is successful, update the filtered data + filtered_data = new_filtered_data + + return filtered_data + + +def clean_multi_glaze_proportion(df): + fully_glazed_descriptions = [ + "Fully double glazed", + "High performance glazing", + "Fully triple glazed", + "Full secondary glazing", + "Multiple glazing throughout", + ] + + df["MULTI_GLAZE_PROPORTION"] = np.where( + pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(fully_glazed_descriptions)), + 100, + df["MULTI_GLAZE_PROPORTION"], + ) + + return df FLOOR_LEVEL_MAP = { @@ -97,6 +149,15 @@ FLOOR_LEVEL_MAP = { "1st": 1, "2nd": 2, "3rd": 3, + "ground floor": 0, + # Put in estimates for these + "mid floor": 2, + "top floor": 4 +} + +BUILT_FORM_REMAP = { + "Enclosed End-Terrace": "End-Terrace", + "Enclosed Mid-Terrace": "Mid-Terrace", } @@ -105,6 +166,7 @@ def app(): directories = list_subdirectories(DATA_DIRECTORY) + dataset = [] for directory in tqdm(directories): filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv") df = pd.read_csv(filepath, low_memory=False) @@ -112,12 +174,14 @@ def app(): df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] - cleaning_averages = 
make_cleaning_averages(df) + cleaning_averages, general_averages = make_cleaning_averages(df) # We remove EPCS that were conducted for a new build, since these are performed with # full SAP, which produces different results to the RdSAP methodology df = df[df["TRANSACTION_TYPE"] != "new dwelling"] + df = clean_multi_glaze_proportion(df) + df["UPRN"] = df["UPRN"].astype(int).astype(str) counts = df.groupby("UPRN").size().reset_index() counts.columns = ["UPRN", "count"] @@ -128,7 +192,6 @@ def app(): df = df[df["UPRN"].isin(counts["UPRN"])] df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) - results = [] for uprn, property_data in df.groupby("UPRN", observed=True): # Fixed features - these are property attributes that shouldn't change over time @@ -143,6 +206,9 @@ def app(): if field == "FLOOR_LEVEL": vals = list({FLOOR_LEVEL_MAP[v] for v in vals}) + if field == "BUILT_FORM": + vals = list({BUILT_FORM_REMAP.get(v, v) for v in vals}) + if field in AVERAGE_FIXED_FEATURES: if len(vals) > 1: @@ -155,18 +221,17 @@ def app(): field_value = np.mean(vals) else: # Clean using averages - avgs = cleaning_averages[ - (cleaning_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) & - (cleaning_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0]) & - (cleaning_averages["CONSTRUCTION_AGE_BAND"] == property_data["CONSTRUCTION_AGE_BAND"].iloc[ - 0]) & - (cleaning_averages["NUMBER_HABITABLE_ROOMS"] == - property_data["NUMBER_HABITABLE_ROOMS"].iloc[0]) & - (cleaning_averages["NUMBER_HEATED_ROOMS"] == property_data["NUMBER_HEATED_ROOMS"].iloc[0]) - ] + avgs = iterative_filtering(cleaning_averages, property_data) field_value = avgs[field].iloc[0] + if pd.isnull(field_value): + # Just use the general averages + field_value = general_averages[ + (general_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) & + (general_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0]) + ][field].iloc[0] + elif field in LATEST_FIELD: 
field_value = vals[-1] if vals else None else: @@ -220,8 +285,9 @@ def app(): "UPRN": uprn, "RDSAP_CHANGE": rdsap_change, "HEAT_DEMAND_CHANGE": heat_demand_change, + **fixed_data, **features.to_dict() } ) - results.extend(property_model_data) + dataset.extend(property_model_data)