From 6ddc9fddca86b30779e6da9870973e9f59fab180 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 10 Oct 2023 12:21:21 +0800 Subject: [PATCH] debuggin sap model data prep --- backend/Property.py | 25 ++++++++++++++++++++----- backend/app/plan/router.py | 33 ++++++++++++++++++++++++++++----- etl/epc/DataProcessor.py | 19 +++++++++++++++++++ etl/epc/property_change_app.py | 17 +---------------- 4 files changed, 68 insertions(+), 26 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index be8ced9f..8d70ba8c 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -632,7 +632,6 @@ class Property(Definitions): 'PHOTO_SUPPLY', 'LOW_ENERGY_LIGHTING', 'SOLAR_WATER_HEATING_FLAG', - 'BUILT_FORM', 'GLAZED_TYPE', 'CONSTITUENCY', 'NUMBER_HEATED_ROOMS', @@ -642,6 +641,21 @@ class Property(Definitions): k: self.data[k.lower().replace("_", "-")] for k in epc_raw_columns } + built_form_cleaning_map = { + "Flat": "Mid-Terrace", + "House": "Semi-Detached", + "Bungalow": "Detached", + "Maisonette": "Mid-Terrace" + } + + built_form = self.data["built-form"] + if built_form in self.DATA_ANOMALY_MATCHES: + # TODO: If built form isn't captured, we use the most common value for that property type - we shall + # improve this methodology + built_form = built_form_cleaning_map.get(self.data["property-type"]) + if not built_form: + raise NotImplementedError("Not handled this property type when cleaning built form") + property_data = { **walls, **roof, @@ -653,15 +667,16 @@ class Property(Definitions): **windows, "SECONDHEAT_DESCRIPTION": second_heating, "DAYS_TO": DataProcessor.calculate_days_to(self.data["lodgement-date"]), - "SAP": self.data["current-energy-efficiency"], - "CARBON": self.data["co2-emissions-current"], - "HEAT_DEMAND": self.data["energy-consumption-current"], + "SAP": float(self.data["current-energy-efficiency"]), + "CARBON": float(self.data["co2-emissions-current"]), + "HEAT_DEMAND": float(self.data["energy-consumption-current"]), "estimated_perimeter": self.perimeter, "CONSTRUCTION_AGE_BAND": self.construction_age_band, "FLOOR_HEIGHT": self.floor_height, "NUMBER_HABITABLE_ROOMS": self.number_of_rooms, "TOTAL_FLOOR_AREA": self.floor_area, - **epc_raw_data + **epc_raw_data, + "BUILT_FORM": built_form, } return property_data diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 1c4261fd..4f6a7f10 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -157,13 +157,19 @@ async def trigger_plan(body: PlanTriggerRequest): data_processor = DataProcessor(None, newdata=True) data_processor.insert_data(pd.DataFrame([p.get_model_data()])) data_processor.pre_process() + data_processor.data = data_processor.clean_missings_after_description_process( + data_processor.data, [ + c for c in data_processor.data.columns if + ("thermal_transmittance" in c) or ("insulation_thickness" in c) + ] + ) starting_epc_data = data_processor.get_component_features(suffix="_STARTING") ending_epc_data = data_processor.get_component_features(suffix="_ENDING") fixed_data = data_processor.get_fixed_features() # We update the ending record with the recommended updates and we set lodgement date to today - ending_epc_data["LODGEMENT_DATE_ENDING"] = data_processor.calculate_days_to(created_at) + ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(created_at) for recommendations_by_type in property_recommendations: for rec in recommendations_by_type: @@ -175,21 +181,38 @@ async def trigger_plan(body: PlanTriggerRequest): fixed_data=fixed_data, ) - fer - + none_cols = [] for col in scoring_dict.keys(): if col in [ "UPRN", "id", "LOCAL_AUTHORITY", ]: continue - if col in ["SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "FLOOR_HEIGHT_STARTING"]: - if scoring_dict[col]: + if col in [ + "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "FLOOR_HEIGHT_STARTING", + "TOTAL_FLOOR_AREA_STARTING", "DAYS_TO_STARTING", "estimated_perimeter_STARTING", + "SAP_ENDING", "HEAT_DEMAND_ENDING", + "CARBON_ENDING", "FLOOR_HEIGHT_ENDING", + "TOTAL_FLOOR_AREA_ENDING", "DAYS_TO_ENDING", "estimated_perimeter_ENDING" + ]: + try: + if scoring_dict[col] is None: + blah1 + float(scoring_dict[col]) + continue + except: + raise Exception("wtf") unique_vals = sap_change_dataset[col].unique() if scoring_dict[col] not in unique_vals: + if scoring_dict[col] is None: + none_cols.append(col) + continue blah + if none_cols: + blahblah + recommendations_scoring_data.append(scoring_dict) # cleanup diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index d0bb66de..cbbc68a7 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -550,3 +550,22 @@ class DataProcessor: return ( pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE) ).dt.days + + @staticmethod + def clean_missings_after_description_process(df, ignore_cols=None): + missings = pd.isnull(df).sum() + missings = missings[missings > 0] + + if ignore_cols: + missings = missings[~missings.index.isin(ignore_cols)] + + for col in missings.index: + unique_values = df[col].unique() + if True in unique_values or False in unique_values: + df[col] = df[col].fillna(False) + if "none" in unique_values: + df[col] = df[col].fillna("none") + else: + df[col] = df[col].fillna("Unknown") + + return df diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py index c887e169..d7dce61c 100644 --- a/etl/epc/property_change_app.py +++ b/etl/epc/property_change_app.py @@ -363,21 +363,6 @@ def make_uvalues(df): return df -def clean_missings_after_description_process(df): - missings = pd.isnull(df).sum() - missings = missings[missings > 0] - for col in missings.index: - unique_values = df[col].unique() - if True in unique_values or False in unique_values: - df[col] = df[col].fillna(False) - if "none" in unique_values: - df[col] = df[col].fillna("none") - else: - df[col] = df[col].fillna("Unknown") - - return df - - def app(): # Get all the files in the directory @@ -544,7 +529,7 @@ def app(): # Those nulls should be False. clean_missings_after_description_process handles this but shouldn't # need to - data_by_urpn_df = clean_missings_after_description_process(data_by_urpn_df) + data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df) if pd.isnull(data_by_urpn_df).sum().sum(): raise ValueError("Null values found in dataset after process_and_prune_desriptions")