diff --git a/backend/Property.py b/backend/Property.py index 259ca724..370eca06 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -749,6 +749,7 @@ class Property(Definitions): "TOTAL_FLOOR_AREA": self.floor_area, **epc_raw_data, "BUILT_FORM": built_form, + "POSTCODE": self.data["postcode"], } return property_data diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index e531896e..4064452f 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -211,11 +211,18 @@ async def trigger_plan(body: PlanTriggerRequest): logger.info("Preparing data for scoring in sap change api") recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data) - # Perform the same cleaning as in the model + # Perform the same cleaning as in the model - first clean number of room variables though recommendations_scoring_data = DataProcessor.apply_averages_cleaning( data_to_clean=recommendations_scoring_data, cleaning_data=cleaning_data, - cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"] + cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], + colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], + ) + + recommendations_scoring_data = DataProcessor.apply_averages_cleaning( + data_to_clean=recommendations_scoring_data, + cleaning_data=cleaning_data, + cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"], ).drop(columns=["LOCAL_AUTHORITY"]) recommendations_scoring_data = DataProcessor.clean_missings_after_description_process( @@ -303,7 +310,7 @@ async def trigger_plan(body: PlanTriggerRequest): # 3) the recommendations logger.info("Uploading recommendations to the database") - for i in tqdm(range(0, len(input_properties), BATCH_SIZE)): + for i in range(0, len(input_properties), BATCH_SIZE): try: # Take a slice of the input_properties list to make a batch batch_properties = input_properties[i:i + BATCH_SIZE] diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index 3ef485b8..0587fdbe 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -179,7 +179,6 @@ class DataProcessor: # We have some non-standard construction age bands which we'll clean for matching if not self.newdata: self.standardise_construction_age_band() - self.clean_missing_rooms() self.recast_df_columns( @@ -451,7 +450,7 @@ class DataProcessor: self.data["PHOTO_SUPPLY"] = self.data["PHOTO_SUPPLY"].fillna(0) @staticmethod - def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on): + def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on, colnames=None): """ Clean the input DataFrame using averages from a cleaning DataFrame. @@ -459,11 +458,16 @@ class DataProcessor: :param cleaning_data: DataFrame containing data for cleaning. :param cols_to_merge_on: Columns on which merging is based. We pass cols_to_merge_on to this function as this differs depending on where the function is being used. + :param colnames: If specified can be used to state exactly which columns to clean :return: Cleaned DataFrame. """ + # The desired colnames to clean - which may not be present + if colnames is None: + colnames = ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "FIXED_LIGHTING_OUTLETS_COUNT"] + cols_to_clean = [ - c for c in ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "FIXED_LIGHTING_OUTLETS_COUNT"] if + c for c in colnames if c in data_to_clean.columns ] @@ -492,6 +496,8 @@ class DataProcessor: for col in cols_to_clean: data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"], inplace=True) data_to_clean.drop(columns=[f"{col}_AVERAGE"], inplace=True) + # If we still have missings + data_to_clean[col].fillna(data_to_clean[col].mean(), inplace=True) return data_to_clean