diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index a77bcaa3..2494497d 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -56,8 +56,11 @@ construction_age_remap = { expanded_map = { i: [ - label for label, bounds in construction_age_bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l']) - ][0] for i in range(0, 3001) + label + for label, bounds in construction_age_bounds_map.items() + if (i <= bounds["u"]) and (i >= bounds["l"]) + ][0] + for i in range(0, 3001) } @@ -74,8 +77,13 @@ class EPCDataProcessor: Handle data loading and data preprocessing """ - def __init__(self, data: pd.DataFrame | None = None, cleaning_averages: pd.DataFrame | None = None, - run_mode: str = "training", violation_mode: bool = False) -> None: + def __init__( + self, + data: pd.DataFrame | None = None, + cleaning_averages: pd.DataFrame | None = None, + run_mode: str = "training", + violation_mode: bool = False, + ) -> None: """ :param filepath: If specified, is the physical location of the data :param is_newdata: Indicates if we are processing new, testing data. @@ -86,7 +94,9 @@ class EPCDataProcessor: self.data: pd.DataFrame = data if is_data_a_dataframe else pd.DataFrame() is_cleaning_averages_a_dataframe = isinstance(cleaning_averages, pd.DataFrame) - self.cleaning_averages: pd.DataFrame = cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame() + self.cleaning_averages: pd.DataFrame = ( + cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame() + ) # FOR NOW IF VIOLATION MODE IS ON, WE USE RUN MODE AS NEWDATA self.violation_mode = violation_mode @@ -103,7 +113,9 @@ class EPCDataProcessor: ignore_step = True if self.run_mode == "newdata" else False if filepath is not None: - self.load_data(filepath=filepath, low_memory=DATA_PROCESSOR_SETTINGS["low_memory"]) + self.load_data( + filepath=filepath, low_memory=DATA_PROCESSOR_SETTINGS["low_memory"] + ) if len(self.data) == 0: raise Exception("No data to process - check filepath/ data being passed in") @@ -114,24 +126,53 @@ class EPCDataProcessor: self.remap_build_form() self.cast_data_column_values_to_lower() self.standardise_construction_age_band(ignore_step=ignore_step) - self.clean_missing_rooms(ignore_step=ignore_step) + + # TEST: Lets no impute any missing rooms to test + self.data = self.data[~self.data["NUMBER_HEATED_ROOMS"].isnull()] + self.data = self.data[~self.data["NUMBER_HABITABLE_ROOMS"].isnull()] + # self.clean_missing_rooms(ignore_step=ignore_step) + self.recast_df_columns( column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"] ) - self.clean_multi_glaze_proportion(ignore_step=ignore_step) + + # TEST: Drop the cleaning of multi glaze proportion + self.data = self.data[~self.data["MULTI_GLAZE_PROPORTION"].isnull()] + # self.clean_multi_glaze_proportion(ignore_step=ignore_step) + + # TEST: drop the cleaning of photo supply - we lose a lot of data + # self.data = self.data[~self.data["PHOTO_SUPPLY"].isnull()] self.clean_photo_supply() + + # TEST: For the na_remapping, we can remove all nas before the retain_multiple_epc_properties step + for col in fill_na_map.keys(): + self.data = self.data[~self.data[col].isnull()] + + # TEST: Need to remove floor height, total floor area and FIXED_LIGHTING_OUTLETS_COUNT from the fill_na_fields + self.data = self.data[~self.data["FLOOR_HEIGHT"].isnull()] + self.data = self.data[~self.data["TOTAL_FLOOR_AREA"].isnull()] + self.data = self.data[~self.data["FIXED_LIGHTING_OUTLETS_COUNT"].isnull()] + self.retain_multiple_epc_properties( - epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"], ignore_step=ignore_step + epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"], + ignore_step=ignore_step, ) - self.fill_na_fields() + # TEST: Should be no need to fill na fields + # self.fill_na_fields() self.sort_data_by_uprn_lodgement_date(ignore_step=ignore_step) # Final re-casting after data transformed and prepared self.recast_df_columns(column_mappings=COLUMNTYPES, auto_subset_columns=True) self.recast_all_data(column_mappings=COLUMNTYPES, auto_subset_columns=True) - self.na_remapping(auto_subset_columns=True) + + # TEST: Remove this step + # self.na_remapping(auto_subset_columns=True) + + if len(self.data) == 0: + self.cast_data_columns_to_lower() + return self.fill_invalid_constituency_fields(ignore_step=ignore_step) @@ -151,11 +192,13 @@ class EPCDataProcessor: if self.run_mode == "newdata": cleaning_averages.columns = cleaning_averages.columns.str.upper() - cleaned_data = self.apply_averages_cleaning( - data_to_clean=self.data, - cleaning_data=cleaning_averages, - cols_to_merge_on=COLUMNS_TO_MERGE_ON, - ) + # TEST: Remove this step + cleaned_data = None + # cleaned_data = self.apply_averages_cleaning( + # data_to_clean=self.data, + # cleaning_data=cleaning_averages, + # cols_to_merge_on=COLUMNS_TO_MERGE_ON, + # ) self.data = self.data if cleaned_data is None else cleaned_data @@ -188,7 +231,9 @@ class EPCDataProcessor: if ignore_step: return - self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0] + self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[ + 0 + ] def fill_invalid_constituency_fields(self, ignore_step: bool = False): """ @@ -201,7 +246,9 @@ class EPCDataProcessor: if ignore_step: return - self.data = self.data.fillna({"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]}) + self.data = self.data.fillna( + {"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]} + ) def sort_data_by_uprn_lodgement_date(self, ignore_step: bool = False): """ @@ -301,7 +348,7 @@ class EPCDataProcessor: """ if self.violation_mode: - # TODO: to fill in + # TODO: to fill in return if ignore_step: @@ -311,9 +358,7 @@ class EPCDataProcessor: lambda x: self.clean_construction_age_band(x) ) - self.data = self.data[ - ~pd.isnull(self.data["CONSTRUCTION_AGE_BAND"]) - ] + self.data = self.data[~pd.isnull(self.data["CONSTRUCTION_AGE_BAND"])] def clean_missing_rooms(self, ignore_step: bool = False): """ @@ -331,31 +376,45 @@ class EPCDataProcessor: return # TODO: DO we want to move this out of this function? (i.e. alter the data before we do any cleaning) - self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(lambda x: x.split(" ")[0]) + self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply( + lambda x: x.split(" ")[0] + ) def apply_clean(data, matching_columns): - cleaning_data = data[~pd.isnull(data[col])].groupby( - matching_columns - )[col].median().reset_index() - - data = data.merge( - cleaning_data, how="left", on=matching_columns, suffixes=("", "_CLEANING") + cleaning_data = ( + data[~pd.isnull(data[col])] + .groupby(matching_columns)[col] + .median() + .reset_index() ) - data[col] = np.where(pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col]) + data = data.merge( + cleaning_data, + how="left", + on=matching_columns, + suffixes=("", "_CLEANING"), + ) + + data[col] = np.where( + pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col] + ) data = data.drop(columns=f"{col}_CLEANING") return data for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]: to_index = 3 - matching_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "POSTAL_AREA"] + matching_columns = [ + "PROPERTY_TYPE", + "BUILT_FORM", + "CONSTRUCTION_AGE_BAND", + "POSTAL_AREA", + ] has_missings = pd.isnull(self.data[col]).sum() while has_missings: self.data = apply_clean( - data=self.data, - matching_columns=matching_columns[0:to_index + 1] + data=self.data, matching_columns=matching_columns[0 : to_index + 1] ) has_missings = pd.isnull(self.data[col]).sum() @@ -363,7 +422,10 @@ class EPCDataProcessor: # Check if we've gotten to index 0 and still have missings - something has gone wrong or # we have a very unique property type if has_missings: - raise NotImplementedError("Handle this edge case, we still have missings for column %s" % col) + raise NotImplementedError( + "Handle this edge case, we still have missings for column %s" + % col + ) break to_index -= 1 @@ -410,7 +472,7 @@ class EPCDataProcessor: # coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else # COLUMNTYPES # for k, v in coltypes.items(): - # self.data[k] = self.data[k].astype(v) + # self.data[k] = self.data[k].astype(v) # self.data = self.data.astype(coltypes) # self.na_remapping() @@ -437,9 +499,11 @@ class EPCDataProcessor: def na_remapping(self, auto_subset_columns: bool = False): - fill_na_map_apply = { - k: v for k, v in fill_na_map.items() if k in self.data.columns - } if auto_subset_columns else fill_na_map + fill_na_map_apply = ( + {k: v for k, v in fill_na_map.items() if k in self.data.columns} + if auto_subset_columns + else fill_na_map + ) for column, fill_value in fill_na_map_apply.items(): self.data[column] = self.data[column].fillna(fill_value) @@ -535,28 +599,34 @@ class EPCDataProcessor: for variable in AVERAGE_FIXED_FEATURES: # Replace any missing NAN values with averages for the same Property type and built form - cleaning_averages_filled[variable] = cleaning_averages_filled[variable].fillna( - cleaning_averages_filled[f"{variable}_AVERAGE"] - ) + cleaning_averages_filled[variable] = cleaning_averages_filled[ + variable + ].fillna(cleaning_averages_filled[f"{variable}_AVERAGE"]) - cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_AVERAGE") + cleaning_averages_filled = cleaning_averages_filled.drop( + columns=f"{variable}_AVERAGE" + ) # If there are still NA values i.e. the averages do not have values for a speicifc group of property tyope # and built form # We can use just the property type average and replace - cleaning_averages_filled[variable] = cleaning_averages_filled[variable].fillna( - cleaning_averages_filled[f"{variable}_PROPERTY_AVERAGE"] - ) + cleaning_averages_filled[variable] = cleaning_averages_filled[ + variable + ].fillna(cleaning_averages_filled[f"{variable}_PROPERTY_AVERAGE"]) - cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_PROPERTY_AVERAGE") + cleaning_averages_filled = cleaning_averages_filled.drop( + columns=f"{variable}_PROPERTY_AVERAGE" + ) # If there are still NA values, use BUILT FORM averages - cleaning_averages_filled["variable"] = cleaning_averages_filled[variable].fillna( - cleaning_averages_filled[f"{variable}_BUILT_FORM_AVERAGE"] - ) + cleaning_averages_filled["variable"] = cleaning_averages_filled[ + variable + ].fillna(cleaning_averages_filled[f"{variable}_BUILT_FORM_AVERAGE"]) - cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE") + cleaning_averages_filled = cleaning_averages_filled.drop( + columns=f"{variable}_BUILT_FORM_AVERAGE" + ) # If there still is na values, use average across all epc in consituecy cleaning_averages_filled[variable] = cleaning_averages_filled[ @@ -573,7 +643,9 @@ class EPCDataProcessor: self.cleaning_averages = cleaning_averages_filled - def retain_multiple_epc_properties(self, epc_minimum_count: int = 1, ignore_step: bool = False) -> None: + def retain_multiple_epc_properties( + self, epc_minimum_count: int = 1, ignore_step: bool = False + ) -> None: """ Reduce the data futher by keeping only datasets with multiple epcs """ @@ -592,12 +664,16 @@ class EPCDataProcessor: counts = counts[counts["count"] > epc_minimum_count] self.data = pd.merge(self.data, counts, on="UPRN") - def recast_df_columns(self, column_mappings: dict, auto_subset_columns: bool = False) -> None: + def recast_df_columns( + self, column_mappings: dict, auto_subset_columns: bool = False + ) -> None: """ Recast columns from the dataframe to ensure the behaviour we want """ if auto_subset_columns: - column_mappings = {k: v for k, v in column_mappings.items() if k in self.data.columns} + column_mappings = { + k: v for k, v in column_mappings.items() if k in self.data.columns + } for key, values in column_mappings.items(): if key not in self.data.columns: @@ -608,13 +684,17 @@ class EPCDataProcessor: else: self.data[key] = self.data[key].astype(values) - def recast_all_data(self, column_mappings: dict, auto_subset_columns: bool = False) -> None: + def recast_all_data( + self, column_mappings: dict, auto_subset_columns: bool = False + ) -> None: """ Using a dictionary to recast all columns at once """ if auto_subset_columns: - column_mappings = {k: v for k, v in column_mappings.items() if k in self.data.columns} + column_mappings = { + k: v for k, v in column_mappings.items() if k in self.data.columns + } self.data = self.data.astype(column_mappings) @@ -625,14 +705,28 @@ class EPCDataProcessor: if self.violation_mode: violation_uprn_missing = pd.isnull(self.data["UPRN"]) - violation_old_lodgment_date = self.data["LODGEMENT_DATE"] < EARLIEST_EPC_DATE - violation_invalid_transaction_type = self.data["TRANSACTION_TYPE"] == IGNORED_TRANSACTION_TYPES - violation_ignored_floor_level = self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS) + violation_old_lodgment_date = ( + self.data["LODGEMENT_DATE"] < EARLIEST_EPC_DATE + ) + violation_invalid_transaction_type = ( + self.data["TRANSACTION_TYPE"] == IGNORED_TRANSACTION_TYPES + ) + violation_ignored_floor_level = self.data["FLOOR_LEVEL"].isin( + IGNORED_FLOOR_LEVELS + ) violation_rdsap_score_above_max = self.data[RDSAP_RESPONSE] > MAX_SAP_SCORE - violation_missing_windows_description = pd.isnull(self.data["WINDOWS_DESCRIPTION"]) - violation_missing_hotwater_description = pd.isnull(self.data["HOTWATER_DESCRIPTION"]) - violation_missing_roof_description = pd.isnull(self.data["ROOF_DESCRIPTION"]) - violation_invalid_property_type = self.data["PROPERTY_TYPE"] == IGNORED_PROPERTY_TYPES + violation_missing_windows_description = pd.isnull( + self.data["WINDOWS_DESCRIPTION"] + ) + violation_missing_hotwater_description = pd.isnull( + self.data["HOTWATER_DESCRIPTION"] + ) + violation_missing_roof_description = pd.isnull( + self.data["ROOF_DESCRIPTION"] + ) + violation_invalid_property_type = ( + self.data["PROPERTY_TYPE"] == IGNORED_PROPERTY_TYPES + ) violation_invalid_tenure = self.data["TENURE"].isin(IGNORED_TENURES) violation_df = pd.concat( @@ -647,7 +741,8 @@ class EPCDataProcessor: violation_missing_roof_description, violation_invalid_property_type, violation_invalid_tenure, - ], axis=1, + ], + axis=1, keys=[ "violation_uprn_missing", "violation_old_lodgment_date", @@ -658,8 +753,8 @@ class EPCDataProcessor: "violation_missing_hotwater_description", "violation_missing_roof_description", "violation_invalid_property_type", - "violation_invalid_tenure" - ] + "violation_invalid_tenure", + ], ) self.data = pd.concat([self.data, violation_df], axis=1) @@ -685,10 +780,10 @@ class EPCDataProcessor: self.data = self.data[~pd.isnull(self.data["UPRN"])] self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] - self.data = self.data[self.data["TRANSACTION_TYPE"] != IGNORED_TRANSACTION_TYPES] self.data = self.data[ - ~self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS) + self.data["TRANSACTION_TYPE"] != IGNORED_TRANSACTION_TYPES ] + self.data = self.data[~self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS)] self.data = self.data[self.data[RDSAP_RESPONSE] <= MAX_SAP_SCORE] # We observed 7 final records with missing windows and 2 records with missing hot water so we shall remove them @@ -705,7 +800,7 @@ class EPCDataProcessor: self.data = self.data[~self.data["TENURE"].isin(IGNORED_TENURES)] # We remap zero values to None - self.data.loc[self.data['FLOOR_HEIGHT'] == 0, 'FLOOR_HEIGHT'] = None + self.data.loc[self.data["FLOOR_HEIGHT"] == 0, "FLOOR_HEIGHT"] = None def clean_multi_glaze_proportion(self, ignore_step: bool = False) -> None: """ @@ -734,7 +829,11 @@ class EPCDataProcessor: @staticmethod def apply_averages_cleaning( - data_to_clean, cleaning_data, cols_to_merge_on, colnames=None, ignore_step: bool = False + data_to_clean, + cleaning_data, + cols_to_merge_on, + colnames=None, + ignore_step: bool = False, ): """ Clean the input DataFrame using averages from a cleaning DataFrame. @@ -752,12 +851,13 @@ class EPCDataProcessor: # The desired colnames to clean - which may not be present if colnames is None: - colnames = ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "FIXED_LIGHTING_OUTLETS_COUNT"] + colnames = [ + "TOTAL_FLOOR_AREA", + "FLOOR_HEIGHT", + "FIXED_LIGHTING_OUTLETS_COUNT", + ] - cols_to_clean = [ - c for c in colnames if - c in data_to_clean.columns - ] + cols_to_clean = [c for c in colnames if c in data_to_clean.columns] # Enforce data types for col in ["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]: @@ -768,7 +868,15 @@ class EPCDataProcessor: # Calculate averages cleaning_averages_to_merge = cleaning_data.groupby(columns_to_merge_on).agg( - dict(zip(cols_to_clean, ["mean", ] * len(cols_to_clean))) + dict( + zip( + cols_to_clean, + [ + "mean", + ] + * len(cols_to_clean), + ) + ) ) # Merge with the original data @@ -777,7 +885,7 @@ class EPCDataProcessor: cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=("", "_AVERAGE"), - how='left' + how="left", ) global_averages = cleaning_data[cols_to_clean].mean() @@ -806,14 +914,20 @@ class EPCDataProcessor: raise Exception("Suffix should be one of _starting or _ending") if suffix == "_STARTING": - starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES].copy().add_suffix(suffix) + starting_cols = ( + self.data[STARTING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES] + .copy() + .add_suffix(suffix) + ) fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS + POTENTIAL_COLUMNS].copy() return pd.concat([starting_cols, fixed_cols], axis=1) - return self.data[ - ENDING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES - ].copy().add_suffix(suffix) + return ( + self.data[ENDING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES] + .copy() + .add_suffix(suffix) + ) def get_fixed_features(self) -> pd.DataFrame: """ @@ -831,14 +945,17 @@ class EPCDataProcessor: :param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids :return: DataFrame with coerced columns. """ - object_columns = df.select_dtypes(include=['object']).columns + object_columns = df.select_dtypes(include=["object"]).columns if cols_to_ignore: object_columns = [c for c in object_columns if c not in cols_to_ignore] for column in object_columns: unique_values = df[column].dropna().unique() # If the unique values in the column are 'True' and 'False', convert the column to boolean - if set(unique_values) == {'True', 'False'} or set(unique_values) == {True, False}: + if set(unique_values) == {"True", "False"} or set(unique_values) == { + True, + False, + }: df[column] = df[column].astype(bool) return df @@ -877,7 +994,6 @@ class EPCDataProcessor: @staticmethod def clean_efficiency_variables(df): - """ These is scope to clean this by the model per corresponding description. E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index 6abf05bd..665d9320 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -83,9 +83,9 @@ class EPCPipeline: run_mode="training", epc_local_file="certificates.csv", epc_bucket_name="retrofit-data-dev", - epc_cleaning_dataset_key="sap_change_model/{}/cleaning_dataset_rooms.parquet", - epc_all_equal_rows_key="sap_change_model/{}/all_equal_rows_rooms.parquet", - epc_compiled_dataset_key="sap_change_model/{}/dataset_rooms.parquet", + epc_cleaning_dataset_key="sap_change_model/{}/cleaning_dataset_no_cleaning.parquet", + epc_all_equal_rows_key="sap_change_model/{}/all_equal_rows_no_cleaning.parquet", + epc_compiled_dataset_key="sap_change_model/{}/dataset_no_cleaning.parquet", use_parallel=False, ): """ @@ -237,6 +237,9 @@ class EPCPipeline: if difference_records is not None: constituency_difference_records.extend(difference_records) + if len(constituency_difference_records) == 0: + return + constituency_dataset = TrainingDataset( datasets=constituency_difference_records, cleaned_lookup=clean_lookup ) diff --git a/etl/epc/generate_scenarios_data.py b/etl/epc/generate_scenarios_data.py index d5bece8b..986410c8 100644 --- a/etl/epc/generate_scenarios_data.py +++ b/etl/epc/generate_scenarios_data.py @@ -54,8 +54,18 @@ scenario_properties = [ "postcode": "NN1 5JY", "lmk-key": "1459796789102016070507274146560098", "measures": [ - [["internal_wall_insulation"], "11", None, [0]], - [["external_wall_insulation"], "10", None, [0]], + [ + ["internal_wall_insulation"], + "11", + {"walls_insulation_thickness_ending": "average"}, + [0], + ], + [ + ["external_wall_insulation"], + "10", + {"walls_insulation_thickness_ending": "average"}, + [0], + ], [["solar", "windows"], "12-15", {"photo_supply_ending": 50}, [0, 1]], ], }, @@ -64,7 +74,12 @@ scenario_properties = [ "postcode": "HP1 2HA", "lmk-key": "c14029235739827d5f627dc8aa9bb567d026b267e851e0db0001db24638667b1", "measures": [ - [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]], + [ + ["cavity_wall_insulation", "loft_insulation"], + "15", + {"walls_insulation_thickness_ending": "average"}, + [0, 1], + ], ], }, { @@ -72,7 +87,12 @@ scenario_properties = [ "postcode": "HP1 2HE", "lmk-key": "99296a6dda21314fef3a61cda59e441e9a2aacf115eb96f4a0fa85696bf7b117", "measures": [ - [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]], + [ + ["cavity_wall_insulation", "loft_insulation"], + "15", + {"walls_insulation_thickness_ending": "average"}, + [0, 1], + ], ], }, { @@ -80,7 +100,12 @@ scenario_properties = [ "postcode": "HP1 2AN", "lmk-key": "d1e0534be3a44c33003323b21d0e322e3daddc65b5ee71936f89c59ddab96b50", "measures": [ - [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]], + [ + ["cavity_wall_insulation", "loft_insulation"], + "15", + {"walls_insulation_thickness_ending": "average"}, + [0, 1], + ], ], }, { @@ -88,7 +113,12 @@ scenario_properties = [ "postcode": "HP1 2HX", "lmk-key": "1eae354db522a95188018d9cd0502ed8c609910b6c88f8797d3a25f59b11770a", "measures": [ - [["cavity_wall_insulation", "loft_insulation"], "15", None, [0, 1]], + [ + ["cavity_wall_insulation", "loft_insulation"], + "15", + {"walls_insulation_thickness_ending": "average"}, + [0, 1], + ], ], }, ] diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py index c985567d..506c32b3 100644 --- a/etl/epc/property_change_app.py +++ b/etl/epc/property_change_app.py @@ -12,7 +12,7 @@ def main(): """ directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] - # directories = directories[0:3] + # directories = directories[76:85] epc_pipeline = EPCPipeline( directories=directories,