From 0c1ce64789938b97a0dfb687e3fda9dab0e5504d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 18 Jan 2024 14:32:24 +0000 Subject: [PATCH 01/48] removed temp code and fixed bug where cleaning data is lower case in newdata mode --- backend/app/plan/router.py | 22 ++++------- etl/epc/DataProcessor.py | 81 ++++++++++++++++++++------------------ 2 files changed, 50 insertions(+), 53 deletions(-) diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 8c199145..b3d1c623 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -28,8 +28,6 @@ from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, sap_to_e from backend.ml_models.api import ModelApi from backend.Property import Property -from etl.epc.DataProcessor import EPCDataProcessor -from etl.epc.settings import COLUMNS_TO_MERGE_ON from etl.solar.SolarPhotoSupply import SolarPhotoSupply from recommendations.optimiser.CostOptimiser import CostOptimiser @@ -68,7 +66,6 @@ async def trigger_plan(body: PlanTriggerRequest): ) input_properties = [] - for config in plan_input: # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly @@ -96,13 +93,16 @@ async def trigger_plan(body: PlanTriggerRequest): ) epc_records = { - 'original_epc': epc_searcher.newest_epc, - 'full_sap_epc': epc_searcher.full_sap_epc, - 'old_data': epc_searcher.older_epcs, + 'original_epc': epc_searcher.newest_epc.copy(), + 'full_sap_epc': epc_searcher.full_sap_epc.copy(), + 'old_data': epc_searcher.older_epcs.copy(), } - prepared_epc = EPCRecord(epc_records=epc_records, run_mode="newdata", - cleaning_data=cleaning_data) # This uses all the epc records to clean the data + prepared_epc = EPCRecord( + epc_records=epc_records, + run_mode="newdata", + cleaning_data=cleaning_data + ) input_properties.append( Property( @@ -173,8 +173,6 @@ async def trigger_plan(body: PlanTriggerRequest): "carbon_change_predictions": get_settings().CARBON_PREDICTIONS_BUCKET } ) - # all_predictions["heat_demand_predictions"]= all_predictions["sap_change_predictions"].copy() - # all_predictions["carbon_change_predictions"] = all_predictions["sap_change_predictions"].copy() # Insert the predictions into the recommendations and run the optimiser logger.info("Optimising recommendations") @@ -310,10 +308,6 @@ async def trigger_plan(body: PlanTriggerRequest): } ) - # all_combined_predictions["heat_demand_predictions"]= all_combined_predictions["sap_change_predictions"].copy() - # all_combined_predictions["carbon_change_predictions"] = all_combined_predictions[ - # "sap_change_predictions"].copy() - # We update the carbon and heat demand predictions for property_id, property_recommendations in recommendations.items(): combined_heat_demand = all_combined_predictions["heat_demand_predictions"] diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index 801a9456..5dfeea1a 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -33,7 +33,6 @@ NO_SUFFIX_COMPONENT_COLS = [x.lower() for x in NO_SUFFIX_COMPONENT_COLS] ENDING_SUFFIX_COMPONENT_COLS = [x.lower() for x in ENDING_SUFFIX_COMPONENT_COLS] POTENTIAL_COLUMNS = [x.lower() for x in POTENTIAL_COLUMNS] - # These lookups are used to clean the construction age band construction_age_bounds_map = { "England and Wales: before 1900": {"l": 0, "u": 1899}, @@ -74,7 +73,8 @@ class EPCDataProcessor: Handle data loading and data preprocessing """ - def __init__(self, data: pd.DataFrame | None = None, cleaning_averages: pd.DataFrame | None = None, run_mode: str = "training", violation_mode: bool = False) -> None: + def __init__(self, data: pd.DataFrame | None = None, cleaning_averages: pd.DataFrame | None = None, + run_mode: str = "training", violation_mode: bool = False) -> None: """ :param filepath: If specified, is the physical location of the data :param is_newdata: Indicates if we are processing new, testing data. @@ -82,23 +82,23 @@ class EPCDataProcessor: want to perform, such as confine_data() """ is_data_a_dataframe = isinstance(data, pd.DataFrame) - self.data : pd.DataFrame = data if is_data_a_dataframe else pd.DataFrame() + self.data: pd.DataFrame = data if is_data_a_dataframe else pd.DataFrame() is_cleaning_averages_a_dataframe = isinstance(cleaning_averages, pd.DataFrame) - self.cleaning_averages : pd.DataFrame = cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame() + self.cleaning_averages: pd.DataFrame = cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame() # FOR NOW IF VIOLATION MODE IS ON, WE USE RUN MODE AS NEWDATA self.violation_mode = violation_mode if run_mode not in ["training", "newdata"]: raise ValueError("Run mode must be either training or newdata") self.run_mode = run_mode if not violation_mode else "newdata" - + def prepare_data(self, filepath: Path | str | None = None) -> None: """ Given the run mode, we apply the relevant pipeline steps Ignore step is used to highlight which steps are not needed in newdata """ - + ignore_step = True if self.run_mode == "newdata" else False if filepath is not None: @@ -126,7 +126,7 @@ class EPCDataProcessor: self.fill_na_fields() self.sort_data_by_uprn_lodgement_date(ignore_step=ignore_step) - + # Final re-casting after data transformed and prepared self.recast_df_columns(column_mappings=COLUMNTYPES, auto_subset_columns=True) self.recast_all_data(column_mappings=COLUMNTYPES, auto_subset_columns=True) @@ -137,32 +137,36 @@ class EPCDataProcessor: self.make_cleaning_averages(ignore_step=ignore_step) # TODO: check if this has impact on training dataset - cleaned_data = self.apply_averages_cleaning( - data_to_clean=self.data, - cleaning_data=self.cleaning_averages, - cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], - colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], - ) + # cleaned_data = self.apply_averages_cleaning( + # data_to_clean=self.data, + # cleaning_data=self.cleaning_averages, + # cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], + # colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], + # ) + + # When running in newdata mode, cleaning_averages has lower cases so we co-erce back to upper + cleaning_averages = self.cleaning_averages.copy() + if self.run_mode == "newdata": + cleaning_averages.columns = cleaning_averages.columns.str.upper() cleaned_data = self.apply_averages_cleaning( - data_to_clean=self.data, - cleaning_data=self.cleaning_averages, - cols_to_merge_on=COLUMNS_TO_MERGE_ON, - ) - + data_to_clean=self.data, + cleaning_data=cleaning_averages, + cols_to_merge_on=COLUMNS_TO_MERGE_ON, + ) + self.data = self.data if cleaned_data is None else cleaned_data self.add_local_authority_to_cleaning_average(ignore_step=ignore_step) self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step) self.cast_data_columns_to_lower() - def cast_data_columns_to_lower(self): """ Convert all columns names to lower """ self.data.columns = self.data.columns.str.lower() - + def cast_cleaning_averages_columns_to_lower(self, ignore_step: bool = False): """ Convert all column names to lower @@ -171,9 +175,9 @@ class EPCDataProcessor: if ignore_step: return - + self.cleaning_averages.columns = self.cleaning_averages.columns.str.lower() - + def add_local_authority_to_cleaning_average(self, ignore_step: bool = False): """ Add the Local authority column to the cleaning averages @@ -182,7 +186,7 @@ class EPCDataProcessor: if ignore_step: return - + self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0] def fill_invalid_constituency_fields(self, ignore_step: bool = False): @@ -195,7 +199,7 @@ class EPCDataProcessor: if ignore_step: return - + self.data = self.data.fillna({"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]}) def sort_data_by_uprn_lodgement_date(self, ignore_step: bool = False): @@ -218,7 +222,6 @@ class EPCDataProcessor: for col in convert_to_lower: self.data[col] = self.data[col].str.lower() - def remap_build_form(self): """ Remap build form to standard values @@ -226,7 +229,6 @@ class EPCDataProcessor: """ self.data["BUILT_FORM"] = self.data["BUILT_FORM"].replace(BUILT_FORM_REMAP) - def remap_anomalies(self): """ Remap anomalies to None @@ -258,7 +260,7 @@ class EPCDataProcessor: if ignore_step: return - + self.data["FLOOR_LEVEL"] = self.data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP) def load_data(self, filepath, low_memory=False) -> None: @@ -404,7 +406,8 @@ class EPCDataProcessor: # self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) # # Final re-casting after data transformed and prepared - # coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else COLUMNTYPES + # coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else + # COLUMNTYPES # for k, v in coltypes.items(): # self.data[k] = self.data[k].astype(v) # self.data = self.data.astype(coltypes) @@ -423,7 +426,7 @@ class EPCDataProcessor: # cleaning_data=self.cleaning_averages, # cols_to_merge_on=COLUMNS_TO_MERGE_ON # ) - + # self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0] # self.cleaning_averages.columns = self.cleaning_averages.columns.str.lower() @@ -431,7 +434,6 @@ class EPCDataProcessor: # return self.data, self.cleaning_averages - def na_remapping(self, auto_subset_columns: bool = False): fill_na_map_apply = { @@ -578,7 +580,7 @@ class EPCDataProcessor: if self.violation_mode: # TODO: to fill in return - + if ignore_step: return @@ -604,15 +606,15 @@ class EPCDataProcessor: self.data[key] = self.data[key].astype(value) else: self.data[key] = self.data[key].astype(values) - + def recast_all_data(self, column_mappings: dict, auto_subset_columns: bool = False) -> None: """ Using a dictionary to recast all columns at once - """ + """ if auto_subset_columns: column_mappings = {k: v for k, v in column_mappings.items() if k in self.data.columns} - + self.data = self.data.astype(column_mappings) def confine_data(self, ignore_step: bool = False): @@ -642,7 +644,7 @@ class EPCDataProcessor: violation_missing_hotwater_description, violation_missing_roof_description, violation_invalid_property_type, - ], axis=1, + ], axis=1, keys=[ "violation_uprn_missing", "violation_old_lodgment_date", @@ -654,8 +656,8 @@ class EPCDataProcessor: "violation_missing_roof_description", "violation_invalid_property_type", ] - ) - + ) + self.data = pd.concat([self.data, violation_df], axis=1) if ignore_step: @@ -703,7 +705,7 @@ class EPCDataProcessor: if self.violation_mode: # TODO: return - + if ignore_step: return @@ -721,7 +723,8 @@ class EPCDataProcessor: self.data["PHOTO_SUPPLY"] = self.data["PHOTO_SUPPLY"].fillna(0) @staticmethod - def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on, colnames=None, ignore_step: bool = False): + def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on, colnames=None, + ignore_step: bool = False): """ Clean the input DataFrame using averages from a cleaning DataFrame. From 1699102cd9a6b5357e04390980899fb8e4b29178 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 18 Jan 2024 15:12:00 +0000 Subject: [PATCH 02/48] added tests for clean_ventilation --- backend/app/plan/router.py | 26 +++++++++ etl/epc/Record.py | 7 ++- etl/epc/tests/test_epcrecord.py | 98 +++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 2 deletions(-) create mode 100644 etl/epc/tests/test_epcrecord.py diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index b3d1c623..d869bcb5 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -164,6 +164,32 @@ async def trigger_plan(body: PlanTriggerRequest): model_api = ModelApi(portfolio_id=body.portfolio_id, timestamp=created_at) + recommendations_scoring_data.head() + z = recommendations_scoring_data[recommendations_scoring_data["uprn"] == 100070505235].copy() + z = z[z["roof_thermal_transmittance"] != z["roof_thermal_transmittance_ending"]] + z["roof_thermal_transmittance_ending"] = 0.4 + z["roof_energy_eff_ending"] = "Average" + + now = model_api.predict_all( + df=z, + bucket=get_settings().DATA_BUCKET, + prediction_buckets={ + "sap_change_predictions": get_settings().SAP_PREDICTIONS_BUCKET, + "heat_demand_predictions": get_settings().HEAT_PREDICTIONS_BUCKET, + "carbon_change_predictions": get_settings().CARBON_PREDICTIONS_BUCKET + } + ) + + now["sap_change_predictions"] + input_properties[1].data["mechanical-ventilation"] + # id predictions property_id recommendation_id + # 0 3696+9 56.3 3696 9 + # 1 3696+10 56.8 3696 10 + # 2 3696+11 56.3 3696 11 + # 3 3696+12 56.8 3696 12 + # With good rather than very good + now["sap_change_predictions"] + all_predictions = model_api.predict_all( df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET, diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 70586749..f1dde43e 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -635,8 +635,11 @@ class EPCRecord: This method will clean the ventilation, if empty or invalid """ self.prepared_epc['mechanical-ventilation'] = None if ( - self.mechanical_ventilation == "" or self.mechanical_ventilation in DATA_ANOMALY_MATCHES) else ( - self.mechanical_ventilation) + (self.prepared_epc['mechanical-ventilation'] == "") or + (self.prepared_epc['mechanical-ventilation'] in DATA_ANOMALY_MATCHES) + ) else ( + self.prepared_epc['mechanical-ventilation'] + ) def _field_validation(self): """ diff --git a/etl/epc/tests/test_epcrecord.py b/etl/epc/tests/test_epcrecord.py new file mode 100644 index 00000000..f55bd30a --- /dev/null +++ b/etl/epc/tests/test_epcrecord.py @@ -0,0 +1,98 @@ +import pytest +from utils.s3 import read_dataframe_from_s3_parquet +from etl.epc.Record import EPCRecord +from unittest.mock import Mock + + +class TestEpcRecord: + + @pytest.fixture() + def cleaning_data(self): + cleaning_data = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", + ) + + return cleaning_data + + @pytest.fixture() + def epc_records_1(self): + epc_records_1 = { + 'original_epc': { + 'low-energy-fixed-light-count': '', 'address': '139 School Road, Hall Green', + 'uprn-source': 'Energy Assessor', 'floor-height': '2.6', 'heating-cost-potential': '1138', + 'unheated-corridor-length': '', 'hot-water-cost-potential': '175', + 'construction-age-band': 'England and Wales: 1900-1929', 'potential-energy-rating': 'B', + 'mainheat-energy-eff': 'Good', 'windows-env-eff': 'Average', 'lighting-energy-eff': 'Very Good', + 'environment-impact-potential': '82', 'glazed-type': 'double glazing, unknown install date', + 'heating-cost-current': '2711', 'address3': '', + 'mainheatcont-description': 'Programmer, TRVs and bypass', + 'sheating-energy-eff': 'N/A', 'property-type': 'House', 'local-authority-label': 'Birmingham', + 'fixed-lighting-outlets-count': '11', 'energy-tariff': 'Single', 'mechanical-ventilation': 'natural', + 'hot-water-cost-current': '310', 'county': '', 'postcode': 'B28 8JF', 'solar-water-heating-flag': 'N', + 'constituency': 'E14000562', 'co2-emissions-potential': '2.0', 'number-heated-rooms': '4', + 'floor-description': 'Suspended, no insulation (assumed)', 'energy-consumption-potential': '107', + 'local-authority': 'E08000025', 'built-form': 'Semi-Detached', 'number-open-fireplaces': '0', + 'windows-description': 'Fully double glazed', 'glazed-area': 'Normal', 'inspection-date': '2023-07-05', + 'mains-gas-flag': 'Y', 'co2-emiss-curr-per-floor-area': '65', 'address1': '139 School Road', + 'heat-loss-corridor': '', 'flat-storey-count': '', 'constituency-label': 'Birmingham, Hall Green', + 'roof-energy-eff': 'Average', 'total-floor-area': '103.0', 'building-reference-number': '10004697322', + 'environment-impact-current': '43', 'co2-emissions-current': '6.7', + 'roof-description': 'Pitched, 100 mm loft insulation', 'floor-energy-eff': 'N/A', + 'number-habitable-rooms': '4', 'address2': 'Hall Green', 'hot-water-env-eff': 'Good', + 'posttown': 'BIRMINGHAM', 'mainheatc-energy-eff': 'Average', 'main-fuel': 'mains gas (not community)', + 'lighting-env-eff': 'Very Good', 'windows-energy-eff': 'Average', 'floor-env-eff': 'N/A', + 'sheating-env-eff': 'N/A', 'lighting-description': 'Low energy lighting in 82% of fixed outlets', + 'roof-env-eff': 'Average', 'walls-energy-eff': 'Very Poor', 'photo-supply': '0.0', + 'lighting-cost-potential': '182', 'mainheat-env-eff': 'Good', 'multi-glaze-proportion': '100', + 'main-heating-controls': '', 'lodgement-datetime': '2023-07-13 08:23:07', 'flat-top-storey': '', + 'current-energy-rating': 'E', 'secondheat-description': 'None', 'walls-env-eff': 'Very Poor', + 'transaction-type': 'rental', 'uprn': '100070505235', 'current-energy-efficiency': '51', + 'energy-consumption-current': '366', 'mainheat-description': 'Boiler and radiators, mains gas', + 'lighting-cost-current': '182', 'lodgement-date': '2023-07-13', 'extension-count': '0', + 'mainheatc-env-eff': 'Average', + 'lmk-key': 'c1d137711da433fb3cced74b1a6848da8bbc1159d076455d26d7b4668982601e', + 'wind-turbine-count': '0', + 'tenure': 'Rented (social)', 'floor-level': '', 'potential-energy-efficiency': '84', + 'hot-water-energy-eff': 'Good', 'low-energy-lighting': '82', + 'walls-description': 'Solid brick, as built, no insulation (assumed)', + 'hotwater-description': 'From main system'}, 'full_sap_epc': {}, 'old_data': [] + } + return epc_records_1 + + def test_clean_mechanical_ventilation(self, cleaning_data, epc_records_1): + # We have an epc with Natural ventilation - the resulting epc should also have natural ventulation + + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "mechanical-ventilation": "natural" + } + record._clean_ventilation() + + assert record.prepared_epc["mechanical-ventilation"] == "natural" + + record2 = EPCRecord(cleaning_data=cleaning_data) + record2.prepared_epc = { + "mechanical-ventilation": "" + } + + record2._clean_ventilation() + + assert record2.prepared_epc["mechanical-ventilation"] is None + + record3 = EPCRecord(cleaning_data=cleaning_data) + record3.prepared_epc = { + "mechanical-ventilation": None + } + + record3._clean_ventilation() + + assert record3.prepared_epc["mechanical-ventilation"] is None + + record4 = EPCRecord(cleaning_data=cleaning_data) + record4.prepared_epc = { + "mechanical-ventilation": "INVALID" + } + + record4._clean_ventilation() + + assert record4.prepared_epc["mechanical-ventilation"] is None From dbe13586da99dbbd28a126eb02537c8987564faf Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 18 Jan 2024 18:10:24 +0000 Subject: [PATCH 03/48] creating unit tests, added test cases for router --- backend/Property.py | 5 +- backend/app/plan/router.py | 36 ++------ backend/ml_models/Valuation.py | 10 +- etl/epc/DataProcessor.py | 5 +- etl/epc/Record.py | 43 ++++++--- etl/epc/tests/test_epcrecord.py | 158 +++++++++++++++++++++++++++++++- 6 files changed, 207 insertions(+), 50 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index 98325b15..c1055eb9 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -222,7 +222,10 @@ class Property(Definitions): proposed_depth = min(valid_numeric_values, key=lambda x: abs(x - proposed_depth)) recommendation_record["roof_insulation_thickness_ending"] = str(proposed_depth) - recommendation_record["roof_energy_eff_ending"] = "Very Good" + if recommendation["type"] == "loft_insulation": + recommendation_record["roof_energy_eff_ending"] = "Good" + else: + recommendation_record["roof_energy_eff_ending"] = "Very Good" else: # Fill missing roof u-values - this fill is not based on recommended upgrades if recommendation_record["roof_thermal_transmittance_ending"] is None: diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index d869bcb5..521ec615 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -65,6 +65,16 @@ async def trigger_plan(body: PlanTriggerRequest): bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet", ) + # For testing: + # plan_input.extend( + # [ + # {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''}, + # {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''}, + # {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''}, + # {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''}, + # ] + # ) + input_properties = [] for config in plan_input: # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly @@ -164,32 +174,6 @@ async def trigger_plan(body: PlanTriggerRequest): model_api = ModelApi(portfolio_id=body.portfolio_id, timestamp=created_at) - recommendations_scoring_data.head() - z = recommendations_scoring_data[recommendations_scoring_data["uprn"] == 100070505235].copy() - z = z[z["roof_thermal_transmittance"] != z["roof_thermal_transmittance_ending"]] - z["roof_thermal_transmittance_ending"] = 0.4 - z["roof_energy_eff_ending"] = "Average" - - now = model_api.predict_all( - df=z, - bucket=get_settings().DATA_BUCKET, - prediction_buckets={ - "sap_change_predictions": get_settings().SAP_PREDICTIONS_BUCKET, - "heat_demand_predictions": get_settings().HEAT_PREDICTIONS_BUCKET, - "carbon_change_predictions": get_settings().CARBON_PREDICTIONS_BUCKET - } - ) - - now["sap_change_predictions"] - input_properties[1].data["mechanical-ventilation"] - # id predictions property_id recommendation_id - # 0 3696+9 56.3 3696 9 - # 1 3696+10 56.8 3696 10 - # 2 3696+11 56.3 3696 11 - # 3 3696+12 56.8 3696 12 - # With good rather than very good - now["sap_change_predictions"] - all_predictions = model_api.predict_all( df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET, diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index 018b4678..dadef9a9 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -96,11 +96,11 @@ class PropertyValuation: if not value: return { - "current_value": None, - "lower_bound_increased_value": None, - "upper_bound_increased_value": None, - "average_increased_value": None, - "average_increase": None + "current_value": 0, + "lower_bound_increased_value": 0, + "upper_bound_increased_value": 0, + "average_increased_value": 0, + "average_increase": 0 } current_epc = property_instance.data["current-energy-rating"] diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index 5dfeea1a..4c4651a4 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -723,8 +723,9 @@ class EPCDataProcessor: self.data["PHOTO_SUPPLY"] = self.data["PHOTO_SUPPLY"].fillna(0) @staticmethod - def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on, colnames=None, - ignore_step: bool = False): + def apply_averages_cleaning( + data_to_clean, cleaning_data, cols_to_merge_on, colnames=None, ignore_step: bool = False + ): """ Clean the input DataFrame using averages from a cleaning DataFrame. diff --git a/etl/epc/Record.py b/etl/epc/Record.py index f1dde43e..4474baf1 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -380,13 +380,21 @@ class EPCRecord: else: # Use averages from the cleaning dataset, based on the property type, built form, construction age # band and local authority + + cleaning_data = self.cleaning_data.copy() + # When running in new-data more, the columns will have been coerced to lower case so we push them + # back to upper case + if self.run_mode == "newdata": + cleaning_data.columns = [x.upper() for x in cleaning_data.columns] + cleaned_property_data = EPCDataProcessor.apply_averages_cleaning( data_to_clean=self.epc_record_as_dataframe("prepared_epc", replace_empty_string=True), cleaning_data=self.cleaning_data, cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], ) self.prepared_epc["fixed-lighting-outlets-count"] = round( - cleaned_property_data["FIXED_LIGHTING_OUTLETS_COUNT"].values[0]) + cleaned_property_data["FIXED_LIGHTING_OUTLETS_COUNT"].values[0] + ) else: self.prepared_epc["fixed-lighting-outlets-count"] = float(self.prepared_epc["fixed-lighting-outlets-count"]) @@ -460,14 +468,14 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - map = { + mains_gas_map = { "Y": True, "N": False, } self.prepared_epc["mains-gas-flag"] = None if ( self.prepared_epc["mains-gas-flag"] == "" or self.prepared_epc["mains-gas-flag"] in DATA_ANOMALY_MATCHES - ) else map[self.prepared_epc["mains-gas-flag"]] + ) else mains_gas_map[self.prepared_epc["mains-gas-flag"]] def _clean_heat_loss_corridor(self): """ @@ -476,15 +484,18 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - map = { - "no corridor": False, - "unheated corridor": True, - "heated corridor": False - } + valid_values = [ + "no corridor", + "unheated corridor", + "heated corridor" + ] - self.prepared_epc["heat-loss-corridor"] = False if self.prepared_epc[ - "heat-loss-corridor"] in DATA_ANOMALY_MATCHES else map[ - self.prepared_epc["heat-loss-corridor"]] + self.prepared_epc["heat-loss-corridor"] = ( + "no corridor" if self.prepared_epc["heat-loss-corridor"] in DATA_ANOMALY_MATCHES else + self.prepared_epc["heat-loss-corridor"] + ) + if self.prepared_epc["heat-loss-corridor"] not in valid_values: + self.prepared_epc["heat-loss-corridor"] = "no corridor" self.prepared_epc["unheated-corridor-length"] = ( float(self.prepared_epc["unheated-corridor-length"]) if @@ -572,11 +583,13 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - self.prepared_epc['built-form'] = BUILT_FORM_REMAP.get(self.prepared_epc["built-form"], - self.prepared_epc["built-form"]) + self.prepared_epc['built-form'] = BUILT_FORM_REMAP.get( + self.prepared_epc["built-form"], self.prepared_epc["built-form"] + ) + if self.prepared_epc["built-form"] in DATA_ANOMALY_MATCHES: - if self.prepared_epc["property-type"] == "Flat": - self.prepared_epc["built-form"] = "Semi-Detached" + if self.prepared_epc["property-type"] in ["Flat", "Maisonette"]: + self.prepared_epc["built-form"] = "End-Terrace" def _clean_age_band(self): """ diff --git a/etl/epc/tests/test_epcrecord.py b/etl/epc/tests/test_epcrecord.py index f55bd30a..06b8ed06 100644 --- a/etl/epc/tests/test_epcrecord.py +++ b/etl/epc/tests/test_epcrecord.py @@ -1,7 +1,8 @@ import pytest from utils.s3 import read_dataframe_from_s3_parquet from etl.epc.Record import EPCRecord -from unittest.mock import Mock +from etl.epc.settings import DATA_ANOMALY_MATCHES +import random class TestEpcRecord: @@ -96,3 +97,158 @@ class TestEpcRecord: record4._clean_ventilation() assert record4.prepared_epc["mechanical-ventilation"] is None + + def test_clean_energy_valid_values(self, cleaning_data, epc_records_1): + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "energy-consumption-current": "200", + "co2-emissions-current": "5.5" + } + record._clean_energy() + + assert record.prepared_epc["energy-consumption-current"] == 200.0 + assert record.prepared_epc["co2-emissions-current"] == 5.5 + + def test_clean_energy_empty_values(self, cleaning_data, epc_records_1): + # We cannot have invalid values so this should raise an exception + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "energy-consumption-current": "", + "co2-emissions-current": "" + } + record._clean_energy() + + with pytest.raises(ValueError): + record._clean_energy() + + def test_clean_built_form_valid_remap(self, cleaning_data, epc_records_1): + record = EPCRecord(cleaning_data=cleaning_data) + # Assuming "Semi" should be remapped to "Semi-Detached" + record.prepared_epc = { + "built-form": "Semi-Detached", + "property-type": "Flat" # Assuming this affects the remapping + } + record._clean_built_form() + + assert record.prepared_epc["built-form"] == "Semi-Detached" + + def test_clean_built_form_anomaly(self, cleaning_data, epc_records_1): + record = EPCRecord(cleaning_data=cleaning_data) + + record.prepared_epc = { + "built-form": "", + "property-type": "Flat" + } + record._clean_built_form() + + assert record.prepared_epc["built-form"] == "End-Terrace" + + def test_clean_floor_area_valid(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "total-floor-area": "120.5" + } + record._clean_floor_area() + + assert record.prepared_epc["total-floor-area"] == 120.5 + + def test_clean_floor_area_empty(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "total-floor-area": "" + } + # We have no known case of missing floor area + with pytest.raises(ValueError): + record._clean_floor_area() + + def test_clean_heat_loss_corridor_valid(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "heat-loss-corridor": "unheated corridor", + "unheated-corridor-length": "" + } + record._clean_heat_loss_corridor() + + assert record.prepared_epc["heat-loss-corridor"] == "unheated corridor" + + def test_clean_heat_loss_corridor_anomaly(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + # Assuming "InvalidCorridor" is an anomaly + record.prepared_epc = { + "heat-loss-corridor": "InvalidCorridor", + "unheated-corridor-length": "" + } + record._clean_heat_loss_corridor() + + assert record.prepared_epc["heat-loss-corridor"] == "no corridor" + + def test_clean_mains_gas_valid(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "mains-gas-flag": "Y" + } + record._clean_mains_gas() + + assert record.prepared_epc["mains-gas-flag"] is True + + def test_clean_mains_gas_anomaly(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "mains-gas-flag": "InvalidValue" + } + # It should always be Y or N or an anomally value + with pytest.raises(ValueError): + record._clean_mains_gas() + + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "mains-gas-flag": random.choice(list(DATA_ANOMALY_MATCHES)) + } + record._clean_mains_gas() + + assert record.prepared_epc["mains-gas-flag"] is None + + def test_clean_solar_hot_water_valid(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "solar-water-heating-flag": "Y" + } + record._clean_solar_hot_water() + + assert record.prepared_epc["solar-water-heating-flag"] is True + + def test_clean_solar_hot_water_empty(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "solar-water-heating-flag": "" + } + record._clean_solar_hot_water() + + assert record.prepared_epc["solar-water-heating-flag"] is None + + def test_clean_number_lighting_outlets_valid(self, cleaning_data, epc_records_1): + record = EPCRecord(cleaning_data=cleaning_data, epc_records=epc_records_1) + record.prepared_epc = { + "fixed-lighting-outlets-count": "5" + } + record._clean_number_lighting_outlets() + + assert record.prepared_epc["fixed-lighting-outlets-count"] == 5.0 + + def test_clean_number_lighting_outlets_empty(self, cleaning_data, epc_records_1): + record = EPCRecord(cleaning_data=cleaning_data) + record.run_mode = "newdata" + record.prepared_epc = { + "fixed-lighting-outlets-count": "", + "property-type": "Flat", + "built-form": "Semi-Detached", + "construction-age-band": "England and Wales: 1900-1929", + "local-authority": "E08000025", + "number-habitable-rooms": "4", + "number-heated-rooms": "4", + } + record.old_data = [] + record.full_sap_epc = [] + record._clean_number_lighting_outlets() + + assert record.prepared_epc["fixed-lighting-outlets-count"] == 8.0 From 86dd6efdc387a2b6c67a9244db3382d6fe7896ab Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 11:27:51 +0000 Subject: [PATCH 04/48] fixed bug in lighting outlets cleaning --- backend/app/plan/router.py | 20 ++++++++++++-------- etl/epc/Record.py | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 521ec615..d3471e8f 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -66,14 +66,18 @@ async def trigger_plan(body: PlanTriggerRequest): ) # For testing: - # plan_input.extend( - # [ - # {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''}, - # {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''}, - # {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''}, - # {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''}, - # ] - # ) + plan_input.extend( + [ + {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''}, + {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''}, + {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''}, + {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''}, + {'address': '4, Heather Shaw', 'postcode': 'BA14 7JS', 'Notes': ''}, + {'address': '16 Glastonbury Road', 'postcode': 'M32 9PE', 'Notes': ''}, + {'address': '31 Loddon Way', 'postcode': 'BA15 1HG', 'Notes': ''}, + {'address': '62 Pearmain Drive', 'postcode': 'NG3 3DJ', 'Notes': ''}, + ] + ) input_properties = [] for config in plan_input: diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 4474baf1..cdbafd7e 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -389,7 +389,7 @@ class EPCRecord: cleaned_property_data = EPCDataProcessor.apply_averages_cleaning( data_to_clean=self.epc_record_as_dataframe("prepared_epc", replace_empty_string=True), - cleaning_data=self.cleaning_data, + cleaning_data=cleaning_data, cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], ) self.prepared_epc["fixed-lighting-outlets-count"] = round( From e7c0b9169cffafef4898131d4e3fc0e4e4421827 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 11:34:09 +0000 Subject: [PATCH 05/48] fixing datetime bug in SearchEpc --- backend/SearchEpc.py | 2 +- backend/app/plan/router.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index d69d8d86..4f6fd33d 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -472,7 +472,7 @@ class SearchEpc: if not epc_data.empty: # Further processing of the EPC data - epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], format='mixed') + epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], errors='coerce') epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1) epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1)) epc_data["numeric_house_number"] = epc_data["house_number"].apply( diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index d3471e8f..39944fe3 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -66,18 +66,18 @@ async def trigger_plan(body: PlanTriggerRequest): ) # For testing: - plan_input.extend( - [ - {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''}, - {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''}, - {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''}, - {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''}, - {'address': '4, Heather Shaw', 'postcode': 'BA14 7JS', 'Notes': ''}, - {'address': '16 Glastonbury Road', 'postcode': 'M32 9PE', 'Notes': ''}, - {'address': '31 Loddon Way', 'postcode': 'BA15 1HG', 'Notes': ''}, - {'address': '62 Pearmain Drive', 'postcode': 'NG3 3DJ', 'Notes': ''}, - ] - ) + # plan_input.extend( + # [ + # {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''}, + # {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''}, + # {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''}, + # {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''}, + # {'address': '4, Heather Shaw', 'postcode': 'BA14 7JS', 'Notes': ''}, + # {'address': '16 Glastonbury Road', 'postcode': 'M32 9PE', 'Notes': ''}, + # {'address': '31 Loddon Way', 'postcode': 'BA15 1HG', 'Notes': ''}, + # {'address': '62 Pearmain Drive', 'postcode': 'NG3 3DJ', 'Notes': ''}, + # ] + # ) input_properties = [] for config in plan_input: From 804e8fb720e473b746e4491a0a5e0700fc486d90 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 12:00:47 +0000 Subject: [PATCH 06/48] handling Epc Record when the EPC has been interpolates --- etl/epc/Record.py | 30 +++++----- etl/epc/tests/test_epcrecord.py | 99 +++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 14 deletions(-) diff --git a/etl/epc/Record.py b/etl/epc/Record.py index cdbafd7e..2535f204 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -349,7 +349,7 @@ class EPCRecord: self.prepared_epc["floor-level"] = ( FLOOR_LEVEL_MAP[self.prepared_epc["floor-level"]] if - self.prepared_epc["floor-level"] not in DATA_ANOMALY_MATCHES else None + self.prepared_epc["floor-level"] not in list(DATA_ANOMALY_MATCHES) + ["", None] else None ) def _clean_number_lighting_outlets(self): @@ -499,7 +499,7 @@ class EPCRecord: self.prepared_epc["unheated-corridor-length"] = ( float(self.prepared_epc["unheated-corridor-length"]) if - self.prepared_epc["unheated-corridor-length"] != "" else None + self.prepared_epc["unheated-corridor-length"] not in ["", None] else None ) def _clean_count_variables(self): @@ -509,18 +509,18 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Record doesn not contain epc data") - fields = { - "number_of_open_fireplaces": "number-open-fireplaces", - "number_of_extensions": "extension-count", - "number_of_storeys": "flat-storey-count", - "number_of_rooms": "number-habitable-rooms", - } + fields = [ + "number-open-fireplaces", + "extension-count", + "flat-storey-count", + "number-habitable-rooms" + ] - null_attributes = ["number_of_storeys", "number_of_rooms"] + null_attributes = ["flat-storey-count", "number-habitable-rooms"] - for attribute, epc_field in fields.items(): - value = self.prepared_epc[epc_field] - if value == "" or value in DATA_ANOMALY_MATCHES: + for attribute in fields: + value = self.prepared_epc[attribute] + if value in ["", None] or value in DATA_ANOMALY_MATCHES: if attribute in null_attributes: value = None else: @@ -537,8 +537,9 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - self.prepared_epc['wind-turbine-count'] = int(self.prepared_epc['wind-turbine-count']) if self.prepared_epc[ - 'wind-turbine-count'] != "" else None + self.prepared_epc['wind-turbine-count'] = int( + self.prepared_epc['wind-turbine-count'] + ) if self.prepared_epc['wind-turbine-count'] not in ["", None] else None def _clean_solar_hot_water(self): """ @@ -551,6 +552,7 @@ class EPCRecord: "Y": True, "N": False, "": None, + None: None } self.prepared_epc['solar-water-heating-flag'] = value_map[self.prepared_epc['solar-water-heating-flag']] diff --git a/etl/epc/tests/test_epcrecord.py b/etl/epc/tests/test_epcrecord.py index 06b8ed06..48ad5148 100644 --- a/etl/epc/tests/test_epcrecord.py +++ b/etl/epc/tests/test_epcrecord.py @@ -171,6 +171,16 @@ class TestEpcRecord: assert record.prepared_epc["heat-loss-corridor"] == "unheated corridor" + record = EPCRecord(cleaning_data=cleaning_data) + record.prepared_epc = { + "heat-loss-corridor": "unheated corridor", + "unheated-corridor-length": None + } + record._clean_heat_loss_corridor() + + assert record.prepared_epc["heat-loss-corridor"] == "unheated corridor" + assert record.prepared_epc["unheated-corridor-length"] is None + def test_clean_heat_loss_corridor_anomaly(self, cleaning_data): record = EPCRecord(cleaning_data=cleaning_data) # Assuming "InvalidCorridor" is an anomaly @@ -252,3 +262,92 @@ class TestEpcRecord: record._clean_number_lighting_outlets() assert record.prepared_epc["fixed-lighting-outlets-count"] == 8.0 + + def test_clean_count_variables(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + + record.prepared_epc = { + "number-open-fireplaces": "1", + "extension-count": None, + "flat-storey-count": "", + "number-habitable-rooms": "INVALID!", + } + + record._clean_count_variables() + + assert record.prepared_epc["number-open-fireplaces"] == 1.0 + assert record.prepared_epc["extension-count"] == 0 + assert record.prepared_epc["flat-storey-count"] is None + assert record.prepared_epc["number-habitable-rooms"] is None + + def test_clean_floor_level(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + + record.prepared_epc = { + "floor-level": "1", + } + + record._clean_floor_level() + + assert record.prepared_epc["floor-level"] == 1.0 + + record = EPCRecord(cleaning_data=cleaning_data) + + record.prepared_epc = { + "floor-level": "", + } + + record._clean_floor_level() + + assert record.prepared_epc["floor-level"] is None + + record = EPCRecord(cleaning_data=cleaning_data) + + record.prepared_epc = { + "floor-level": None, + } + + record._clean_floor_level() + + assert record.prepared_epc["floor-level"] is None + + def test_clean_solar_hot_water(self, cleaning_data): + record = EPCRecord(cleaning_data=cleaning_data) + + record.prepared_epc = { + "solar-water-heating-flag": "Y", + } + + record._clean_solar_hot_water() + + assert record.prepared_epc["solar-water-heating-flag"] is True + + record = EPCRecord(cleaning_data=cleaning_data) + + record.prepared_epc = { + "solar-water-heating-flag": "N", + } + + record._clean_solar_hot_water() + + assert record.prepared_epc["solar-water-heating-flag"] is False + + record = EPCRecord(cleaning_data=cleaning_data) + + record.prepared_epc = { + "solar-water-heating-flag": "", + } + + record._clean_solar_hot_water() + + assert record.prepared_epc["solar-water-heating-flag"] is None + + record = EPCRecord(cleaning_data=cleaning_data) + + record.prepared_epc = { + "solar-water-heating-flag": None, + } + + record._clean_solar_hot_water() + + assert record.prepared_epc["solar-water-heating-flag"] is None From 3cf13c651cb1f16297df94a63a161663f20b490b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 13:36:36 +0000 Subject: [PATCH 07/48] changing some of the gets to [] gets in Property class --- backend/Property.py | 22 +++++++++++----------- backend/app/plan/router.py | 2 ++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index c1055eb9..736ab4f1 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -68,7 +68,7 @@ class Property(Definitions): self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None self.restricted_measures = False self.year_built = epc_record.get("year_built") - self.number_of_rooms = epc_record.prepared_epc.get("number_of_rooms") + self.number_of_rooms = epc_record.prepared_epc["number_heated_rooms"] self.age_band = epc_record.get("age_band") self.construction_age_band = epc_record.get("construction_age_band") self.number_of_floors = epc_record.get("number_of_floors") @@ -81,7 +81,7 @@ class Property(Definitions): "co2_emissions": epc_record.get("co2_emissions_current"), } self.ventilation = { - "ventilation": epc_record.get("mechanical_ventilation"), + "ventilation": epc_record.prepared_epc["mechanical_ventilation"], } self.solar_pv = { "solar_pv": epc_record.get("photo_supply"), @@ -90,28 +90,28 @@ class Property(Definitions): "solar_hot_water": epc_record.get("solar_water_heating_flag"), } self.wind_turbine = { - "wind_turbine": epc_record.prepared_epc.get("wind_turbine_count"), + "wind_turbine": epc_record.prepared_epc["wind_turbine_count"], } self.number_of_open_fireplaces = { - "number_of_open_fireplaces": epc_record.prepared_epc.get("number_of_open_fireplaces"), + "number_of_open_fireplaces": epc_record.prepared_epc["number_open_fireplaces"], } self.number_of_extensions = { - "number_of_extensions": epc_record.prepared_epc.get("number_of_extensions"), + "number_of_extensions": epc_record.prepared_epc["extension_count"], } self.number_of_storeys = { - "number_of_storeys": epc_record.prepared_epc.get("number_of_storeys"), + "number_of_storeys": epc_record.prepared_epc["flat_storey_count"], } self.heat_loss_corridor = { - "heat_loss_corridor": epc_record.prepared_epc.get("heat_loss_corridor"), - "length": epc_record.prepared_epc.get("unheated_corridor_length"), + "heat_loss_corridor": epc_record.prepared_epc["heat_loss_corridor"], + "length": epc_record.prepared_epc["unheated_corridor_length"], } - self.mains_gas = epc_record.prepared_epc.get('mains_gas_flag') - self.floor_height = epc_record.prepared_epc.get('floor_height') + self.mains_gas = epc_record.prepared_epc['mains_gas_flag'] + self.floor_height = epc_record.prepared_epc['floor_height'] self.insulation_wall_area = None self.floor_area = epc_record.prepared_epc.get('total_floor_area') self.pitched_roof_area = None self.insulation_floor_area = None - self.number_lighting_outlets = epc_record.prepared_epc.get("fixed_lighting_outlets_count") + self.number_lighting_outlets = epc_record.prepared_epc["fixed_lighting_outlets_count"] self.floor_level = None self.number_of_windows = None self.solar_pv_roof_area = None diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 39944fe3..6e9c4f50 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -159,6 +159,8 @@ async def trigger_plan(body: PlanTriggerRequest): recommender = Recommendations(property_instance=p, materials=materials) property_recommendations = recommender.recommend() + recommender.wall_recomender.estimated_u_value + if not property_recommendations: continue From 807e6d5047dcfce2bb1a2e4bf9f548ebc419b01a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 16:28:35 +0000 Subject: [PATCH 08/48] align processing of solar hot water flag between engine and model --- backend/app/plan/router.py | 16 ---------------- etl/epc/Record.py | 8 ++++---- recommendations/FloorRecommendations.py | 1 + recommendations/RoofRecommendations.py | 1 + 4 files changed, 6 insertions(+), 20 deletions(-) diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 6e9c4f50..b3d1c623 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -65,20 +65,6 @@ async def trigger_plan(body: PlanTriggerRequest): bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet", ) - # For testing: - # plan_input.extend( - # [ - # {'address': '73 Long Chaulden', 'postcode': 'HP1 2HX', 'Notes': ''}, - # {'address': '8 Lindlings', 'postcode': 'HP1 2HA', 'Notes': ''}, - # {'address': '44 Lindlings', 'postcode': 'HP1 2HE', 'Notes': ''}, - # {'address': '46 Chaulden Terrace', 'postcode': 'HP1 2AN', 'Notes': ''}, - # {'address': '4, Heather Shaw', 'postcode': 'BA14 7JS', 'Notes': ''}, - # {'address': '16 Glastonbury Road', 'postcode': 'M32 9PE', 'Notes': ''}, - # {'address': '31 Loddon Way', 'postcode': 'BA15 1HG', 'Notes': ''}, - # {'address': '62 Pearmain Drive', 'postcode': 'NG3 3DJ', 'Notes': ''}, - # ] - # ) - input_properties = [] for config in plan_input: # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly @@ -159,8 +145,6 @@ async def trigger_plan(body: PlanTriggerRequest): recommender = Recommendations(property_instance=p, materials=materials) property_recommendations = recommender.recommend() - recommender.wall_recomender.estimated_u_value - if not property_recommendations: continue diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 2535f204..1c6d694d 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -549,10 +549,10 @@ class EPCRecord: raise ValueError("EPC Recrod doesn not contain epc data") value_map = { - "Y": True, - "N": False, - "": None, - None: None + "Y": "Y", + "N": "N", + "": "N", + None: "N" } self.prepared_epc['solar-water-heating-flag'] = value_map[self.prepared_epc['solar-water-heating-flag']] diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py index a246c8cb..2f568264 100644 --- a/recommendations/FloorRecommendations.py +++ b/recommendations/FloorRecommendations.py @@ -109,6 +109,7 @@ class FloorRecommendations(Definitions): insulation_thickness=self.property.floor["insulation_thickness"], wall_type=self.property.wall_type ) + self.estimated_u_value = u_value if u_value < self.BUILDING_REGULATIONS_PART_L_MAX_U_VALUE: diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py index dc1aff3f..0bbfd69d 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -91,6 +91,7 @@ class RoofRecommendations: raise NotImplementedError("Implement me") u_value = get_roof_u_value(**{**self.property.roof, "age_band": self.property.age_band}) + self.estimated_u_value = u_value if u_value <= self.BUILDING_REGULATIONS_PART_L_MAX_U_VALUE: # The Roof is already compliant From 24709b98d604fe93edfcbb959c728dbf9dac60f7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 16:51:49 +0000 Subject: [PATCH 09/48] Added more wall u-value tests --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/Property.py | 6 ++-- etl/epc/Record.py | 19 +++++++++++++ .../tests/test_data/wall_uvalue_test_cases.py | 28 +++++++++++++++++++ 5 files changed, 53 insertions(+), 4 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 4413bb06..b0f9c00d 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 6f308057..1122b380 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/Property.py b/backend/Property.py index 736ab4f1..ee496552 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -88,6 +88,7 @@ class Property(Definitions): } self.solar_hot_water = { "solar_hot_water": epc_record.get("solar_water_heating_flag"), + "solar_hot_water_boolean": epc_record.get("solar_water_heating_flag_bool"), } self.wind_turbine = { "wind_turbine": epc_record.prepared_epc["wind_turbine_count"], @@ -104,6 +105,7 @@ class Property(Definitions): self.heat_loss_corridor = { "heat_loss_corridor": epc_record.prepared_epc["heat_loss_corridor"], "length": epc_record.prepared_epc["unheated_corridor_length"], + "heat_loss_corridor_boolean": epc_record.get("heat_loss_corridor_bool"), } self.mains_gas = epc_record.prepared_epc['mains_gas_flag'] self.floor_height = epc_record.prepared_epc['floor_height'] @@ -436,10 +438,10 @@ class Property(Definitions): "mainfuel": self.main_fuel["clean_description"], "ventilation": self.ventilation["ventilation"], "solar_pv": self.solar_pv["solar_pv"], - "solar_hot_water": self.solar_hot_water["solar_hot_water"], + "solar_hot_water": self.solar_hot_water["solar_hot_water_boolean"], "wind_turbine": self.wind_turbine["wind_turbine"], "floor_height": self.floor_height, - "heat_loss_corridor": self.heat_loss_corridor["heat_loss_corridor"], + "heat_loss_corridor": self.heat_loss_corridor["heat_loss_corridor_boolean"], "unheated_corridor_length": self.heat_loss_corridor["length"], "number_of_open_fireplaces": self.number_of_open_fireplaces["number_of_open_fireplaces"], "number_of_extensions": self.number_of_extensions["number_of_extensions"], diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 1c6d694d..6fb4d5d9 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -102,6 +102,8 @@ class EPCRecord: year_built: int = None number_of_floors: int = None number_of_open_fireplaces: int = None + heat_loss_corridor_bool: bool = None + solar_water_heating_flag_bool: bool = None def __post_init__(self): # We can have validation and cleaning steps for each of the fields @@ -490,6 +492,12 @@ class EPCRecord: "heated corridor" ] + boolean_map = { + "no corridor": False, + "unheated corridor": True, + "heated corridor": False + } + self.prepared_epc["heat-loss-corridor"] = ( "no corridor" if self.prepared_epc["heat-loss-corridor"] in DATA_ANOMALY_MATCHES else self.prepared_epc["heat-loss-corridor"] @@ -502,6 +510,9 @@ class EPCRecord: self.prepared_epc["unheated-corridor-length"] not in ["", None] else None ) + # We create boolean versions of heat-loss-corridor + self.heat_loss_corridor_bool = boolean_map[self.prepared_epc["heat-loss-corridor"]] + def _clean_count_variables(self): """ This method will clean the count variables, if empty or invalid @@ -555,8 +566,16 @@ class EPCRecord: None: "N" } + boolean_map = { + "Y": True, + "N": False, + } + self.prepared_epc['solar-water-heating-flag'] = value_map[self.prepared_epc['solar-water-heating-flag']] + # Create a boolean version for storage in the database + self.solar_water_heating_flag_bool = boolean_map[self.prepared_epc['solar-water-heating-flag']] + def _clean_solar_pv(self): """ This method will clean the solar pv, if empty or invalid diff --git a/recommendations/tests/test_data/wall_uvalue_test_cases.py b/recommendations/tests/test_data/wall_uvalue_test_cases.py index e0c6ebe3..87f1ad3f 100644 --- a/recommendations/tests/test_data/wall_uvalue_test_cases.py +++ b/recommendations/tests/test_data/wall_uvalue_test_cases.py @@ -76,5 +76,33 @@ wall_uvalue_test_cases = [ "is_granite_or_whinstone": False, "is_sandstone_or_limestone": False, "uvalue": 0 + }, + { + "clean_description": "Cavity wall, as built, insulated", + "age_band": "F", + "is_granite_or_whinstone": False, + "is_sandstone_or_limestone": False, + "uvalue": 0.4 + }, + { + "clean_description": "Cavity wall, as built, insulated", + "age_band": "D", + "is_granite_or_whinstone": False, + "is_sandstone_or_limestone": False, + "uvalue": 0.7 + }, + { + "clean_description": "Cavity wall, filled cavity", + "age_band": "E", + "is_granite_or_whinstone": False, + "is_sandstone_or_limestone": False, + "uvalue": 0.7 + }, + { + "clean_description": "Cavity wall, as built, no insulation", + "age_band": "E", + "is_granite_or_whinstone": False, + "is_sandstone_or_limestone": False, + "uvalue": 1.5 } ] From 80c35d42df60e365f3a93218f6bb2affa0650dab Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 17:11:06 +0000 Subject: [PATCH 10/48] Added floor uvalue cases --- .../test_data/floor_uvalue_test_cases.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/recommendations/tests/test_data/floor_uvalue_test_cases.py b/recommendations/tests/test_data/floor_uvalue_test_cases.py index 91d3814f..7104fd9d 100644 --- a/recommendations/tests/test_data/floor_uvalue_test_cases.py +++ b/recommendations/tests/test_data/floor_uvalue_test_cases.py @@ -29,4 +29,34 @@ floor_uvalue_test_cases = [ "insulation_thickness": None, "expected": ValueError, }, + # 16 Glastonbury road EPR - the EPR has 0.71 due to the property having 320mm wall thickness, but default being 250 + { + "floor_type": "suspended", + "area": 34.5, + "perimeter": 16.7, + "age_band": "D", + "wall_type": "cavity", + "insulation_thickness": None, + "expected": 0.72, + }, + # 31 Loddon Way - the EPR has 0.5 due to the property having 320mm wall thickness, but default being 250 + { + "floor_type": "solid", + "area": 52.08, + "perimeter": 16.2, + "age_band": "E", + "wall_type": "cavity", + "insulation_thickness": None, + "expected": 0.52, + }, + # 62 Pearmain Drive + { + "floor_type": "solid", + "area": 38.64, + "perimeter": 18.1, + "age_band": "E", + "wall_type": "cavity", + "insulation_thickness": None, + "expected": 0.69, + }, ] From 43f3169e0c77b15203800dfe15ea8747649fbad7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 17:25:53 +0000 Subject: [PATCH 11/48] corrected unit tests: --- backend/tests/test_sap_model_prep.py | 1000 -------------------------- etl/epc/tests/test_epcrecord.py | 23 +- 2 files changed, 14 insertions(+), 1009 deletions(-) delete mode 100644 backend/tests/test_sap_model_prep.py diff --git a/backend/tests/test_sap_model_prep.py b/backend/tests/test_sap_model_prep.py deleted file mode 100644 index 89c436ce..00000000 --- a/backend/tests/test_sap_model_prep.py +++ /dev/null @@ -1,1000 +0,0 @@ -from backend.Property import Property -from etl.epc.DataProcessor import DataProcessor -from backend.app.plan.utils import create_recommendation_scoring_data, get_cleaned -from etl.epc.settings import COLUMNS_TO_MERGE_ON -import pandas as pd -import pytest -import msgpack - -from utils.s3 import read_dataframe_from_s3_parquet, read_from_s3 - - -# Handy code for selecting testing data -# import pickle -# -# with open("sap_dataset.pickle", "rb") as f: -# sap_change_dataset = pickle.load(f) -# -# search_from = sap_change_dataset[ -# (sap_change_dataset["walls_thermal_transmittance_ENDING"] == sap_change_dataset["walls_thermal_transmittance"]) & -# sap_change_dataset["is_to_unheated_space"] -# ] -# search_from = search_from[ -# (search_from["roof_thermal_transmittance_ENDING"] == search_from["roof_thermal_transmittance"]) & -# (search_from["floor_thermal_transmittance_ENDING"] != search_from["floor_thermal_transmittance"]) & -# (search_from["MECHANICAL_VENTILATION_ENDING"] == search_from["MECHANICAL_VENTILATION_STARTING"]) & -# (search_from["SECONDHEAT_DESCRIPTION_ENDING"] == search_from["SECONDHEAT_DESCRIPTION_STARTING"]) & -# (search_from["GLAZED_TYPE_ENDING"] == search_from["GLAZED_TYPE_STARTING"]) -# ] -# -# # Find a record where the only difference is cavity wall getting filled -# ending_cols = [c for c in search_from.columns if "_ENDING" in c] -# -# ignore = [ -# "SAP_ENDING", "HEAT_DEMAND_ENDING", "CARBON_ENDING", "TRANSACTION_TYPE_ENDING", "FLOOR_HEIGHT_ENDING", -# "DAYS_TO_ENDING", "TOTAL_FLOOR_AREA_ENDING" -# ] -# -# ending_cols = [c for c in ending_cols if c not in ignore] -# -# for _, row in tqdm(search_from.iterrows(), total=search_from.shape[0]): -# -# same = True -# starting_cols = [] -# for c in ending_cols: -# -# starting_col = c.replace("_ENDING", "") -# if starting_col not in search_from.columns: -# starting_col = c.replace("_ENDING", "_STARTING") -# if starting_col not in search_from.columns: -# raise Exception("something went wrong") -# -# starting_cols.append(starting_col) -# -# # We want them to be different -# if c == "floor_thermal_transmittance_ENDING": -# if (row[c] == row[starting_col]) | (row[starting_col] != "natural"): -# same = False -# break -# else: -# continue -# -# # We now check if the starting and ending values are the same -# if row[c] != row[starting_col]: -# same = False -# break -# -# if same: -# raise Exception("We found one!") -# -# fixed_cols = [c for c in search_from.columns if c not in starting_cols + ending_cols] -# -# import pandas as pd -# -# start = row[["SAP_STARTING"] + starting_cols] -# start.index = [c.replace("_STARTING", "") for c in start.index] -# end = row[["SAP_ENDING"] + ending_cols] -# end.index = [c.replace("_ENDING", "") for c in end.index] -# start["type"] = "starting" -# end["type"] = "ending" -# -# compare = pd.concat([start, end], axis=1) -# -# ending_lmk = "1252008839062019090910572351658131" -# starting_lmk = "1252008819542014122308482236142128" -# -# client = EpcClient(auth_token=EPC_AUTH_TOKEN) -# result = client.domestic.search(params={"address": "Flat 14 Charles House, Freemens Way", "postcode": "CT14 9DL"}) -# starting_epc = [x for x in result["rows"] if x["lmk-key"] == starting_lmk][0] -# ending_epc = [x for x in result["rows"] if x["lmk-key"] == ending_lmk][0] - - -# with open( -# os.path.abspath(os.path.dirname(__file__)) + "/backend/tests/test_data/cleaned.pickle", "rb" -# ) as f: -# cleaned = pickle.load(f) - -# with open( -# os.path.abspath(os.path.dirname(__file__)) + "/backend/tests/test_data/cleaning_data.pickle", "rb" -# ) as f: -# cleaning_data = pickle.load(f) - -# TODO: Need to do floors, suspended and solid and to unheated space - - -class TestSapModelPrep: - - @pytest.fixture - def cleaning_data(self): - return read_dataframe_from_s3_parquet( - bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", - ) - - @pytest.fixture - def cleaned(self): - cleaned = read_from_s3( - s3_file_name="cleaned_epc_data/cleaned.bson", - bucket_name="retrofit-data-dev" - ) - - cleaned = msgpack.unpackb(cleaned, raw=False) - return cleaned - - @pytest.fixture - def photo_supply_lookup(self): - photo_supply_lookup = read_dataframe_from_s3_parquet( - bucket_name="retrofit-data-dev", file_key="solar_pv_supply/photo_supply_lookup.parquet", - ) - return photo_supply_lookup - - @pytest.fixture - def floor_area_decile_thresholds(self): - floor_area_decile_thresholds = read_dataframe_from_s3_parquet( - bucket_name="retrofit-data-dev", file_key="solar_pv_supply/floor_area_decile_thresholds.parquet", - ) - return floor_area_decile_thresholds - - def test_fill_cavity_wall(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds): - """ - We ensure that the process that prepares the data in the engine code results in the same data as - the model is trained on - """ - - # This is an actual starting EPC - starting_epc = { - 'low-energy-fixed-light-count': '', 'address': '26, Vicarage Lane, Eaton', - 'uprn-source': 'Address Matched', 'floor-height': '2.39', 'heating-cost-potential': '942', - 'unheated-corridor-length': '', 'hot-water-cost-potential': '97', - 'construction-age-band': 'England and Wales: 1967-1975', 'potential-energy-rating': 'D', - 'mainheat-energy-eff': 'Average', 'windows-env-eff': 'Good', 'lighting-energy-eff': 'Average', - 'environment-impact-potential': '53', - 'glazed-type': 'double glazing installed during or after 2002', 'heating-cost-current': '1475', - 'address3': '', 'mainheatcont-description': 'Programmer, room thermostat and TRVs', - 'sheating-energy-eff': 'N/A', 'property-type': 'House', 'local-authority-label': 'Melton', - 'fixed-lighting-outlets-count': '', 'energy-tariff': 'Single', - 'mechanical-ventilation': 'natural', 'hot-water-cost-current': '96', 'county': 'Leicestershire', - 'postcode': 'NG32 1SP', 'solar-water-heating-flag': 'Y', 'constituency': 'E14000909', - 'co2-emissions-potential': '5.7', 'number-heated-rooms': '7', - 'floor-description': 'Suspended, no insulation (assumed)', - 'energy-consumption-potential': '177', 'local-authority': 'E07000133', 'built-form': 'Detached', - 'number-open-fireplaces': '1', 'windows-description': 'Fully double glazed', - 'glazed-area': 'Normal', 'inspection-date': '2016-09-22', 'mains-gas-flag': 'N', - 'co2-emiss-curr-per-floor-area': '87', 'address1': '26, Vicarage Lane', - 'heat-loss-corridor': 'NO DATA!', 'flat-storey-count': '', - 'constituency-label': 'Rutland and Melton', 'roof-energy-eff': 'Very Poor', - 'total-floor-area': '116.0', 'building-reference-number': '4940047478', - 'environment-impact-current': '29', 'co2-emissions-current': '10.0', - 'roof-description': 'Pitched, limited insulation (assumed)', 'floor-energy-eff': 'NO DATA!', - 'number-habitable-rooms': '7', 'address2': 'Eaton', 'hot-water-env-eff': 'Good', - 'posttown': 'GRANTHAM', 'mainheatc-energy-eff': 'Good', 'main-fuel': 'oil (not community)', - 'lighting-env-eff': 'Average', 'windows-energy-eff': 'Good', 'floor-env-eff': 'N/A', - 'sheating-env-eff': 'N/A', - 'lighting-description': 'Low energy lighting in 31% of fixed outlets', - 'roof-env-eff': 'Very Poor', 'walls-energy-eff': 'Poor', 'photo-supply': '', - 'lighting-cost-potential': '69', 'mainheat-env-eff': 'Good', 'multi-glaze-proportion': '100', - 'main-heating-controls': '2106', 'lodgement-datetime': '2016-09-23 20:29:01', - 'flat-top-storey': '', 'current-energy-rating': 'F', - 'secondheat-description': 'Room heaters, dual fuel (mineral and wood)', 'walls-env-eff': 'Poor', - 'transaction-type': 'marketed sale', 'uprn': '100030534042', 'current-energy-efficiency': '34', - 'energy-consumption-current': '343', 'mainheat-description': 'Boiler and radiators, oil', - 'lighting-cost-current': '117', 'lodgement-date': '2016-09-23', 'extension-count': '2', - 'mainheatc-env-eff': 'Good', 'lmk-key': '1481856849902016092320290148762028', - 'wind-turbine-count': '0', 'tenure': 'owner-occupied', 'floor-level': 'NODATA!', - 'potential-energy-efficiency': '64', 'hot-water-energy-eff': 'Good', - 'low-energy-lighting': '31', - 'walls-description': 'Cavity wall, as built, no insulation (assumed)', - 'hotwater-description': 'From main system, plus solar' - } - - # This is the training data as we prepare it in the engine - # This is an actual record from the training data - row = { - 'UPRN': '100030534042', 'RDSAP_CHANGE': 12, 'HEAT_DEMAND_CHANGE': -72, - 'CARBON_CHANGE': -2.0999999999999996, 'SAP_STARTING': 34, 'SAP_ENDING': 46, 'HEAT_DEMAND_STARTING': 343, - 'HEAT_DEMAND_ENDING': 271, 'CARBON_STARTING': 10.0, 'CARBON_ENDING': 7.9, 'PROPERTY_TYPE': 'House', - 'BUILT_FORM': 'Detached', 'CONSTITUENCY': 'E14000909', 'NUMBER_HABITABLE_ROOMS': 7.0, - 'NUMBER_HEATED_ROOMS': 7.0, 'FIXED_LIGHTING_OUTLETS_COUNT': 21.0, - 'CONSTRUCTION_AGE_BAND': 'England and Wales: 1967-1975', 'TRANSACTION_TYPE_STARTING': 'marketed sale', - 'MECHANICAL_VENTILATION_STARTING': 'natural', - 'SECONDHEAT_DESCRIPTION_STARTING': 'Room heaters, dual fuel (mineral and wood)', - 'ENERGY_TARIFF_STARTING': 'Single', 'SOLAR_WATER_HEATING_FLAG_STARTING': 'Y', - 'PHOTO_SUPPLY_STARTING': 0.0, 'GLAZED_TYPE_STARTING': 'double glazing installed during or after 2002', - 'MULTI_GLAZE_PROPORTION_STARTING': 100.0, 'LOW_ENERGY_LIGHTING_STARTING': 31.0, - 'NUMBER_OPEN_FIREPLACES_STARTING': 1.0, 'EXTENSION_COUNT_STARTING': 2.0, - 'TOTAL_FLOOR_AREA_STARTING': 116.0, 'FLOOR_HEIGHT_STARTING': 2.39, - 'TRANSACTION_TYPE_ENDING': 'marketed sale', 'MECHANICAL_VENTILATION_ENDING': 'natural', - 'SECONDHEAT_DESCRIPTION_ENDING': 'Room heaters, dual fuel (mineral and wood)', - 'ENERGY_TARIFF_ENDING': 'Single', 'SOLAR_WATER_HEATING_FLAG_ENDING': 'Y', 'PHOTO_SUPPLY_ENDING': 0.0, - 'GLAZED_TYPE_ENDING': 'double glazing installed during or after 2002', - 'MULTI_GLAZE_PROPORTION_ENDING': 100.0, 'LOW_ENERGY_LIGHTING_ENDING': 31.0, - 'NUMBER_OPEN_FIREPLACES_ENDING': 1.0, 'EXTENSION_COUNT_ENDING': 2.0, 'TOTAL_FLOOR_AREA_ENDING': 116.0, - 'FLOOR_HEIGHT_ENDING': 2.41, 'DAYS_TO_STARTING': 784, 'DAYS_TO_ENDING': 867, - 'walls_thermal_transmittance': 1.5, 'is_cavity_wall': True, 'is_filled_cavity': False, - 'is_solid_brick': False, 'is_system_built': False, 'is_timber_frame': False, - 'is_granite_or_whinstone': False, 'is_as_built': True, 'is_cob': False, - 'is_sandstone_or_limestone': False, 'is_park_home': False, 'walls_insulation_thickness': 'none', - 'external_insulation': False, 'internal_insulation': False, 'walls_thermal_transmittance_ENDING': 0.7, - 'is_park_home_ENDING': False, 'walls_insulation_thickness_ENDING': 'average', - 'external_insulation_ENDING': False, 'internal_insulation_ENDING': False, - 'floor_thermal_transmittance': 0.52, 'is_to_unheated_space': False, 'is_to_external_air': False, - 'is_suspended': True, 'is_solid': False, 'another_property_below': False, - 'floor_insulation_thickness': 'none', 'floor_thermal_transmittance_ENDING': 0.52, - 'floor_insulation_thickness_ENDING': 'none', 'roof_thermal_transmittance': 1.5, 'is_pitched': True, - 'is_roof_room': False, 'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, - 'has_dwelling_above': False, 'roof_insulation_thickness': 'below average', - 'roof_thermal_transmittance_ENDING': 1.5, 'roof_insulation_thickness_ENDING': 'below average', - 'heater_type': 'Unknown', 'system_type': 'from main system', 'thermostat_characteristics': 'Unknown', - 'heating_scope': 'Unknown', 'energy_recovery': 'Unknown', 'hotwater_tariff_type': 'Unknown', - 'extra_features': 'plus solar', 'chp_systems': 'Unknown', 'distribution_system': 'Unknown', - 'no_system_present': 'Unknown', 'appliance': 'Unknown', 'heater_type_ENDING': 'Unknown', - 'system_type_ENDING': 'from main system', 'thermostat_characteristics_ENDING': 'Unknown', - 'heating_scope_ENDING': 'Unknown', 'energy_recovery_ENDING': 'Unknown', - 'hotwater_tariff_type_ENDING': 'Unknown', 'extra_features_ENDING': 'plus solar', - 'chp_systems_ENDING': 'Unknown', 'distribution_system_ENDING': 'Unknown', - 'no_system_present_ENDING': 'Unknown', 'appliance_ENDING': 'Unknown', 'has_radiators': True, - 'has_fan_coil_units': False, 'has_pipes_in_screed_above_insulation': False, - 'has_pipes_in_insulated_timber_floor': False, 'has_pipes_in_concrete_slab': False, 'has_boiler': True, - 'has_air_source_heat_pump': False, 'has_room_heaters': False, 'has_electric_storage_heaters': False, - 'has_warm_air': False, 'has_electric_underfloor_heating': False, 'has_electric_ceiling_heating': False, - 'has_community_scheme': False, 'has_ground_source_heat_pump': False, 'has_no_system_present': False, - 'has_portable_electric_heaters': False, 'has_water_source_heat_pump': False, - 'has_electric_heat_pump': False, 'has_micro-cogeneration': False, 'has_solar_assisted_heat_pump': False, - 'has_exhaust_source_heat_pump': False, 'has_community_heat_pump': False, 'has_electric': False, - 'has_mains_gas': False, 'has_wood_logs': False, 'has_coal': False, 'has_oil': True, - 'has_wood_pellets': False, 'has_anthracite': False, 'has_dual_fuel_mineral_and_wood': False, - 'has_smokeless_fuel': False, 'has_lpg': False, 'has_b30k': False, 'has_electricaire': False, - 'has_assumed_for_most_rooms': False, 'has_underfloor_heating': False, 'has_radiators_ENDING': True, - 'has_fan_coil_units_ENDING': False, 'has_pipes_in_screed_above_insulation_ENDING': False, - 'has_pipes_in_insulated_timber_floor_ENDING': False, 'has_pipes_in_concrete_slab_ENDING': False, - 'has_boiler_ENDING': True, 'has_air_source_heat_pump_ENDING': False, 'has_room_heaters_ENDING': False, - 'has_electric_storage_heaters_ENDING': False, 'has_warm_air_ENDING': False, - 'has_electric_underfloor_heating_ENDING': False, 'has_electric_ceiling_heating_ENDING': False, - 'has_community_scheme_ENDING': False, 'has_ground_source_heat_pump_ENDING': False, - 'has_no_system_present_ENDING': False, 'has_portable_electric_heaters_ENDING': False, - 'has_water_source_heat_pump_ENDING': False, 'has_electric_heat_pump_ENDING': False, - 'has_micro-cogeneration_ENDING': False, 'has_solar_assisted_heat_pump_ENDING': False, - 'has_exhaust_source_heat_pump_ENDING': False, 'has_community_heat_pump_ENDING': False, - 'has_electric_ENDING': False, 'has_mains_gas_ENDING': False, 'has_wood_logs_ENDING': False, - 'has_coal_ENDING': False, 'has_oil_ENDING': True, 'has_wood_pellets_ENDING': False, - 'has_anthracite_ENDING': False, 'has_dual_fuel_mineral_and_wood_ENDING': False, - 'has_smokeless_fuel_ENDING': False, 'has_lpg_ENDING': False, 'has_b30k_ENDING': False, - 'has_electricaire_ENDING': False, 'has_assumed_for_most_rooms_ENDING': False, - 'has_underfloor_heating_ENDING': False, 'thermostatic_control': 'room thermostat', - 'charging_system': 'Unknown', 'switch_system': 'programmer', 'no_control': 'Unknown', - 'dhw_control': 'Unknown', 'community_heating': 'Unknown', 'multiple_room_thermostats': False, - 'auxiliary_systems': 'Unknown', 'trvs': 'trvs', 'rate_control': 'Unknown', - 'thermostatic_control_ENDING': 'room thermostat', 'charging_system_ENDING': 'Unknown', - 'switch_system_ENDING': 'programmer', 'no_control_ENDING': 'Unknown', 'dhw_control_ENDING': 'Unknown', - 'community_heating_ENDING': 'Unknown', 'multiple_room_thermostats_ENDING': False, - 'auxiliary_systems_ENDING': 'Unknown', 'trvs_ENDING': 'trvs', 'rate_control_ENDING': 'Unknown', - 'glazing_type': 'double', 'glazing_type_ENDING': 'double', 'fuel_type': 'oil', - 'main-fuel_tariff_type': 'Unknown', 'is_community': False, - 'no_individual_heating_or_community_network': False, 'complex_fuel_type': 'Unknown', - 'fuel_type_ENDING': 'oil', 'main-fuel_tariff_type_ENDING': 'Unknown', 'is_community_ENDING': False, - 'no_individual_heating_or_community_network_ENDING': False, 'complex_fuel_type_ENDING': 'Unknown', - 'estimated_perimeter_STARTING': 30.531014675946444, 'estimated_perimeter_ENDING': 30.531014675946444, - 'HOT_WATER_ENERGY_EFF_STARTING': "Good", - "FLOOR_ENERGY_EFF_STARTING": "Unknown", - "WINDOWS_ENERGY_EFF_STARTING": "Good", - "WALLS_ENERGY_EFF_STARTING": "Poor", - "SHEATING_ENERGY_EFF_STARTING": "Unknown", - "ROOF_ENERGY_EFF_STARTING": "Very Poor", - "MAINHEAT_ENERGY_EFF_STARTING": "Average", - "MAINHEATC_ENERGY_EFF_STARTING": "Good", - "LIGHTING_ENERGY_EFF_STARTING": "Average", - "POTENTIAL_ENERGY_EFFICIENCY": 64, - "ENVIRONMENT_IMPACT_POTENTIAL": 53, - "ENERGY_CONSUMPTION_POTENTIAL": 177.0, - "CO2_EMISSIONS_POTENTIAL": 5.7, - "HOT_WATER_ENERGY_EFF_ENDING": "Good", - "FLOOR_ENERGY_EFF_ENDING": "Unknown", - "WINDOWS_ENERGY_EFF_ENDING": "Good", - "WALLS_ENERGY_EFF_ENDING": "Good", - "SHEATING_ENERGY_EFF_ENDING": "Unknown", - "ROOF_ENERGY_EFF_ENDING": "Very Poor", - "MAINHEAT_ENERGY_EFF_ENDING": "Average", - "MAINHEATC_ENERGY_EFF_ENDING": "Good", - "LIGHTING_ENERGY_EFF_ENDING": "Average", - } - - home = Property( - id=0, - postcode=starting_epc["postcode"], - address=starting_epc["address1"], - data=starting_epc - ) - home.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds) - - data_processor = DataProcessor(None, newdata=True) - data_processor.insert_data(pd.DataFrame([home.get_model_data()])) - - data_processor.pre_process() - - starting_epc_data = data_processor.get_component_features(suffix="_STARTING") - ending_epc_data = data_processor.get_component_features(suffix="_ENDING") - fixed_data = data_processor.get_fixed_features() - - ending_lodgement_date = '2016-12-15' - - ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(ending_lodgement_date) - - recommendation = { - "recommendation_id": 0, - "new_u_value": 0.7, - "type": "cavity_wall_insulation" - } - - test_record = create_recommendation_scoring_data( - property=home, - recommendation=recommendation, - starting_epc_data=starting_epc_data, - ending_epc_data=ending_epc_data, - fixed_data=fixed_data, - ) - test_record = pd.DataFrame([test_record]) - - # Test the final cleaning: - test_record = DataProcessor.apply_averages_cleaning( - data_to_clean=test_record, - cleaning_data=cleaning_data, - cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"] - ).drop(columns=["LOCAL_AUTHORITY"]) - - test_record = DataProcessor.clean_missings_after_description_process( - test_record, [ - c for c in test_record.columns if - ("thermal_transmittance" in c) or ("insulation_thickness" in c) - ] - ) - - # Test that the data has been set up correctly - - # Things to fix: - # [] Filled cavity should have an average insulation thickness in the cleaned data - - for c in test_record.columns: - if c in ["id", "SAP_ENDING", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]: - continue - - if c == "FLOOR_HEIGHT_ENDING": - assert (row[c] - test_record[c].values[0]) <= 0.020001 - continue - - if c == "walls_insulation_thickness_ENDING": - assert row[c] == "average" - assert test_record[c].values[0] == "above average" - continue - - assert test_record[c].values[0] == row[c] - - def test_internal_wall_insulation(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds): - - starting_epc2 = { - 'low-energy-fixed-light-count': '2', 'address': 'FLAT 12, WAREHOUSE W, 3 WESTERN GATEWAY', - 'uprn-source': 'Energy Assessor', 'floor-height': '3.64', 'heating-cost-potential': '465', - 'unheated-corridor-length': '', 'hot-water-cost-potential': '185', - 'construction-age-band': 'England and Wales: 1900-1929', 'potential-energy-rating': 'C', - 'mainheat-energy-eff': 'Very Poor', 'windows-env-eff': 'Average', 'lighting-energy-eff': 'Poor', - 'environment-impact-potential': '51', 'glazed-type': 'double glazing installed during or after 2002', - 'heating-cost-current': '1223', 'address3': '3 WESTERN GATEWAY', - 'mainheatcont-description': 'Programmer and appliance thermostats', 'sheating-energy-eff': 'N/A', - 'property-type': 'Flat', 'local-authority-label': 'Newham', 'fixed-lighting-outlets-count': '12', - 'energy-tariff': 'off-peak 7 hour', 'mechanical-ventilation': 'natural', 'hot-water-cost-current': '342', - 'county': '', 'postcode': 'E16 1BD', 'solar-water-heating-flag': 'N', 'constituency': 'E14001032', - 'co2-emissions-potential': '3.6', 'number-heated-rooms': '2', 'floor-description': '(other premises below)', - 'energy-consumption-potential': '307', 'local-authority': 'E09000025', 'built-form': 'Mid-Terrace', - 'number-open-fireplaces': '0', 'windows-description': 'Partial double glazing', 'glazed-area': 'Normal', - 'inspection-date': '2020-10-14', 'mains-gas-flag': 'N', 'co2-emiss-curr-per-floor-area': '66', - 'address1': 'FLAT 12', 'heat-loss-corridor': 'heated corridor', 'flat-storey-count': '', - 'constituency-label': 'West Ham', 'roof-energy-eff': 'N/A', 'total-floor-area': '70.0', - 'building-reference-number': '10000539740', 'environment-impact-current': '42', - 'co2-emissions-current': '4.6', 'roof-description': '(another dwelling above)', 'floor-energy-eff': 'N/A', - 'number-habitable-rooms': '2', 'address2': 'WAREHOUSE W', 'hot-water-env-eff': 'Poor', 'posttown': 'LONDON', - 'mainheatc-energy-eff': 'Good', 'main-fuel': 'electricity (not community)', 'lighting-env-eff': 'Poor', - 'windows-energy-eff': 'Average', 'floor-env-eff': 'N/A', 'sheating-env-eff': 'N/A', - 'lighting-description': 'Low energy lighting in 17% of fixed outlets', 'roof-env-eff': 'N/A', - 'walls-energy-eff': 'Very Poor', 'photo-supply': '0.0', 'lighting-cost-potential': '67', - 'mainheat-env-eff': 'Poor', 'multi-glaze-proportion': '61', 'main-heating-controls': '', - 'lodgement-datetime': '2020-10-14 00:00:00', 'flat-top-storey': 'N', 'current-energy-rating': 'F', - 'secondheat-description': 'None', 'walls-env-eff': 'Very Poor', 'transaction-type': 'marketed sale', - 'uprn': '10012839482', 'current-energy-efficiency': '33', 'energy-consumption-current': '393', - 'mainheat-description': 'Room heaters, electric', 'lighting-cost-current': '110', - 'lodgement-date': '2020-10-14', 'extension-count': '0', 'mainheatc-env-eff': 'Good', - 'lmk-key': 'b0d82f468273bec55ec5676a809b8e36b55db940ffa92f482a482f6aaa38eb1d', 'wind-turbine-count': '0', - 'tenure': 'Owner-occupied', 'floor-level': '01', 'potential-energy-efficiency': '71', - 'hot-water-energy-eff': 'Very Poor', 'low-energy-lighting': '17', - 'walls-description': 'Solid brick, as built, no insulation (assumed)', - 'hotwater-description': 'Electric immersion, standard tariff' - } - - row2 = { - 'UPRN': '10012839482', 'RDSAP_CHANGE': 8, 'HEAT_DEMAND_CHANGE': -59, - 'CARBON_CHANGE': -0.5999999999999996, 'SAP_STARTING': 33, 'SAP_ENDING': 41, 'HEAT_DEMAND_STARTING': 393, - 'HEAT_DEMAND_ENDING': 334, 'CARBON_STARTING': 4.6, 'CARBON_ENDING': 4.0, 'PROPERTY_TYPE': 'Flat', - 'BUILT_FORM': 'Mid-Terrace', 'CONSTITUENCY': 'E14001032', 'NUMBER_HABITABLE_ROOMS': 2.0, - 'NUMBER_HEATED_ROOMS': 2.0, 'FIXED_LIGHTING_OUTLETS_COUNT': 12.0, - 'CONSTRUCTION_AGE_BAND': 'England and Wales: 1996-2002', 'TRANSACTION_TYPE_STARTING': 'marketed sale', - 'MECHANICAL_VENTILATION_STARTING': 'natural', 'SECONDHEAT_DESCRIPTION_STARTING': 'None', - 'ENERGY_TARIFF_STARTING': 'off-peak 7 hour', 'SOLAR_WATER_HEATING_FLAG_STARTING': 'N', - 'PHOTO_SUPPLY_STARTING': 0.0, 'GLAZED_TYPE_STARTING': 'double glazing installed during or after 2002', - 'MULTI_GLAZE_PROPORTION_STARTING': 61.0, 'LOW_ENERGY_LIGHTING_STARTING': 17.0, - 'NUMBER_OPEN_FIREPLACES_STARTING': 0.0, 'EXTENSION_COUNT_STARTING': 0.0, - 'TOTAL_FLOOR_AREA_STARTING': 70.0, 'FLOOR_HEIGHT_STARTING': 3.64, - 'TRANSACTION_TYPE_ENDING': 'marketed sale', 'MECHANICAL_VENTILATION_ENDING': 'natural', - 'SECONDHEAT_DESCRIPTION_ENDING': 'None', 'ENERGY_TARIFF_ENDING': 'off-peak 7 hour', - 'SOLAR_WATER_HEATING_FLAG_ENDING': 'N', 'PHOTO_SUPPLY_ENDING': 0.0, - 'GLAZED_TYPE_ENDING': 'double glazing installed during or after 2002', - 'MULTI_GLAZE_PROPORTION_ENDING': 61.0, 'LOW_ENERGY_LIGHTING_ENDING': 17.0, - 'NUMBER_OPEN_FIREPLACES_ENDING': 0.0, 'EXTENSION_COUNT_ENDING': 0.0, 'TOTAL_FLOOR_AREA_ENDING': 70.0, - 'FLOOR_HEIGHT_ENDING': 3.64, 'DAYS_TO_STARTING': 2266, 'DAYS_TO_ENDING': 2307, - 'walls_thermal_transmittance': 1.7, 'is_cavity_wall': False, 'is_filled_cavity': False, - 'is_solid_brick': True, 'is_system_built': False, 'is_timber_frame': False, - 'is_granite_or_whinstone': False, 'is_as_built': True, 'is_cob': False, - 'is_sandstone_or_limestone': False, 'is_park_home': False, 'walls_insulation_thickness': 'none', - 'external_insulation': False, 'internal_insulation': False, 'walls_thermal_transmittance_ENDING': 0.21, - 'is_park_home_ENDING': False, 'walls_insulation_thickness_ENDING': 'average', - 'external_insulation_ENDING': False, 'internal_insulation_ENDING': False, - 'floor_thermal_transmittance': 0.0, 'is_to_unheated_space': False, 'is_to_external_air': False, - 'is_suspended': False, 'is_solid': False, 'another_property_below': True, - 'floor_insulation_thickness': 'none', 'floor_thermal_transmittance_ENDING': 0.0, - 'floor_insulation_thickness_ENDING': 'none', 'roof_thermal_transmittance': 0.0, 'is_pitched': False, - 'is_roof_room': False, 'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, - 'has_dwelling_above': True, 'roof_insulation_thickness': 'none', - 'roof_thermal_transmittance_ENDING': 0.0, 'roof_insulation_thickness_ENDING': 'none', - 'heater_type': 'electric immersion', 'system_type': 'Unknown', 'thermostat_characteristics': 'Unknown', - 'heating_scope': 'Unknown', 'energy_recovery': 'Unknown', 'hotwater_tariff_type': 'standard tariff', - 'extra_features': 'Unknown', 'chp_systems': 'Unknown', 'distribution_system': 'Unknown', - 'no_system_present': 'Unknown', 'appliance': 'Unknown', 'heater_type_ENDING': 'electric immersion', - 'system_type_ENDING': 'Unknown', 'thermostat_characteristics_ENDING': 'Unknown', - 'heating_scope_ENDING': 'Unknown', 'energy_recovery_ENDING': 'Unknown', - 'hotwater_tariff_type_ENDING': 'standard tariff', 'extra_features_ENDING': 'Unknown', - 'chp_systems_ENDING': 'Unknown', 'distribution_system_ENDING': 'Unknown', - 'no_system_present_ENDING': 'Unknown', 'appliance_ENDING': 'Unknown', 'has_radiators': False, - 'has_fan_coil_units': False, 'has_pipes_in_screed_above_insulation': False, - 'has_pipes_in_insulated_timber_floor': False, 'has_pipes_in_concrete_slab': False, 'has_boiler': False, - 'has_air_source_heat_pump': False, 'has_room_heaters': True, 'has_electric_storage_heaters': False, - 'has_warm_air': False, 'has_electric_underfloor_heating': False, 'has_electric_ceiling_heating': False, - 'has_community_scheme': False, 'has_ground_source_heat_pump': False, 'has_no_system_present': False, - 'has_portable_electric_heaters': False, 'has_water_source_heat_pump': False, - 'has_electric_heat_pump': False, 'has_micro-cogeneration': False, 'has_solar_assisted_heat_pump': False, - 'has_exhaust_source_heat_pump': False, 'has_community_heat_pump': False, 'has_electric': True, - 'has_mains_gas': False, 'has_wood_logs': False, 'has_coal': False, 'has_oil': False, - 'has_wood_pellets': False, 'has_anthracite': False, 'has_dual_fuel_mineral_and_wood': False, - 'has_smokeless_fuel': False, 'has_lpg': False, 'has_b30k': False, 'has_electricaire': False, - 'has_assumed_for_most_rooms': False, 'has_underfloor_heating': False, 'has_radiators_ENDING': False, - 'has_fan_coil_units_ENDING': False, 'has_pipes_in_screed_above_insulation_ENDING': False, - 'has_pipes_in_insulated_timber_floor_ENDING': False, 'has_pipes_in_concrete_slab_ENDING': False, - 'has_boiler_ENDING': False, 'has_air_source_heat_pump_ENDING': False, 'has_room_heaters_ENDING': True, - 'has_electric_storage_heaters_ENDING': False, 'has_warm_air_ENDING': False, - 'has_electric_underfloor_heating_ENDING': False, 'has_electric_ceiling_heating_ENDING': False, - 'has_community_scheme_ENDING': False, 'has_ground_source_heat_pump_ENDING': False, - 'has_no_system_present_ENDING': False, 'has_portable_electric_heaters_ENDING': False, - 'has_water_source_heat_pump_ENDING': False, 'has_electric_heat_pump_ENDING': False, - 'has_micro-cogeneration_ENDING': False, 'has_solar_assisted_heat_pump_ENDING': False, - 'has_exhaust_source_heat_pump_ENDING': False, 'has_community_heat_pump_ENDING': False, - 'has_electric_ENDING': True, 'has_mains_gas_ENDING': False, 'has_wood_logs_ENDING': False, - 'has_coal_ENDING': False, 'has_oil_ENDING': False, 'has_wood_pellets_ENDING': False, - 'has_anthracite_ENDING': False, 'has_dual_fuel_mineral_and_wood_ENDING': False, - 'has_smokeless_fuel_ENDING': False, 'has_lpg_ENDING': False, 'has_b30k_ENDING': False, - 'has_electricaire_ENDING': False, 'has_assumed_for_most_rooms_ENDING': False, - 'has_underfloor_heating_ENDING': False, 'thermostatic_control': 'appliance thermostats', - 'charging_system': 'Unknown', 'switch_system': 'programmer', 'no_control': 'Unknown', - 'dhw_control': 'Unknown', 'community_heating': 'Unknown', 'multiple_room_thermostats': False, - 'auxiliary_systems': 'Unknown', 'trvs': 'Unknown', 'rate_control': 'Unknown', - 'thermostatic_control_ENDING': 'appliance thermostats', 'charging_system_ENDING': 'Unknown', - 'switch_system_ENDING': 'programmer', 'no_control_ENDING': 'Unknown', 'dhw_control_ENDING': 'Unknown', - 'community_heating_ENDING': 'Unknown', 'multiple_room_thermostats_ENDING': False, - 'auxiliary_systems_ENDING': 'Unknown', 'trvs_ENDING': 'Unknown', 'rate_control_ENDING': 'Unknown', - 'glazing_type': 'double', 'glazing_type_ENDING': 'double', 'fuel_type': 'electricity', - 'main-fuel_tariff_type': 'Unknown', 'is_community': False, - 'no_individual_heating_or_community_network': False, 'complex_fuel_type': 'Unknown', - 'fuel_type_ENDING': 'electricity', 'main-fuel_tariff_type_ENDING': 'Unknown', - 'is_community_ENDING': False, 'no_individual_heating_or_community_network_ENDING': False, - 'complex_fuel_type_ENDING': 'Unknown', 'estimated_perimeter_STARTING': 35.4964786985977, - 'estimated_perimeter_ENDING': 35.4964786985977, - 'HOT_WATER_ENERGY_EFF_STARTING': "Very Poor", - "FLOOR_ENERGY_EFF_STARTING": "Unknown", - "WINDOWS_ENERGY_EFF_STARTING": "Average", - "WALLS_ENERGY_EFF_STARTING": "Very Poor", - "SHEATING_ENERGY_EFF_STARTING": "Unknown", - "ROOF_ENERGY_EFF_STARTING": "Unknown", - "MAINHEAT_ENERGY_EFF_STARTING": "Very Poor", - "MAINHEATC_ENERGY_EFF_STARTING": "Good", - "LIGHTING_ENERGY_EFF_STARTING": "Poor", - "POTENTIAL_ENERGY_EFFICIENCY": 71, - "ENVIRONMENT_IMPACT_POTENTIAL": 51, - "ENERGY_CONSUMPTION_POTENTIAL": 307, - "CO2_EMISSIONS_POTENTIAL": 3.6, - 'HOT_WATER_ENERGY_EFF_ENDING': "Very Poor", - "FLOOR_ENERGY_EFF_ENDING": "Unknown", - "WINDOWS_ENERGY_EFF_ENDING": "Average", - "WALLS_ENERGY_EFF_ENDING": "Good", - "SHEATING_ENERGY_EFF_ENDING": "Unknown", - "ROOF_ENERGY_EFF_ENDING": "Unknown", - "MAINHEAT_ENERGY_EFF_ENDING": "Very Poor", - "MAINHEATC_ENERGY_EFF_ENDING": "Good", - "LIGHTING_ENERGY_EFF_ENDING": "Poor", - } - - home2 = Property( - id=0, - postcode=starting_epc2["postcode"], - address=starting_epc2["address1"], - data=starting_epc2 - ) - home2.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds) - home2.set_number_lighting_outlets(None) - - data_processor2 = DataProcessor(None, newdata=True) - data_processor2.insert_data(pd.DataFrame([home2.get_model_data()])) - - data_processor2.pre_process() - - starting_epc_data2 = data_processor2.get_component_features(suffix="_STARTING") - ending_epc_data2 = data_processor2.get_component_features(suffix="_ENDING") - fixed_data2 = data_processor2.get_fixed_features() - - ending_lodgement_date2 = '2020-11-24' - - ending_epc_data2["DAYS_TO_ENDING"] = data_processor2.calculate_days_to(ending_lodgement_date2) - - recommendation2 = { - "recommendation_id": 0, - "new_u_value": 0.21, - "type": "internal_wall_insulation" - } - - test_record2 = create_recommendation_scoring_data( - property=home2, - recommendation=recommendation2, - starting_epc_data=starting_epc_data2, - ending_epc_data=ending_epc_data2, - fixed_data=fixed_data2, - ) - test_record2 = pd.DataFrame([test_record2]) - - # Test the final cleaning: - test_record2 = DataProcessor.apply_averages_cleaning( - data_to_clean=test_record2, - cleaning_data=cleaning_data, - cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"] - ).drop(columns=["LOCAL_AUTHORITY"]) - - test_record2 = DataProcessor.clean_missings_after_description_process( - test_record2, [ - c for c in test_record2.columns if - ("thermal_transmittance" in c) or ("insulation_thickness" in c) - ] - ) - - for c in test_record2.columns: - if c in ["id", "SAP_ENDING", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]: - continue - - if c == "FLOOR_HEIGHT_ENDING": - assert (row2[c] - test_record2[c].values[0]) <= 0.020001 - continue - - if c == "walls_insulation_thickness_ENDING": - assert row2[c] == "average" - assert test_record2[c].values[0] == "above average" - continue - - if c == "CONSTRUCTION_AGE_BAND": - # For this, we have different values in the original data - assert row2[c] == "England and Wales: 1996-2002" - assert test_record2[c].values[0] == "England and Wales: 1900-1929" - continue - - assert test_record2[c].values[0] == row2[c] - - def test_ventilation(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds): - - starting_epc3 = { - 'low-energy-fixed-light-count': '', 'address': '45 Shepperson Road', 'uprn-source': 'Energy Assessor', - 'floor-height': '1.87', 'heating-cost-potential': '645', 'unheated-corridor-length': '', - 'hot-water-cost-potential': '69', 'construction-age-band': 'England and Wales: 1900-1929', - 'potential-energy-rating': 'C', 'mainheat-energy-eff': 'Good', 'windows-env-eff': 'Average', - 'lighting-energy-eff': 'Average', 'environment-impact-potential': '75', - 'glazed-type': 'double glazing, unknown install date', 'heating-cost-current': '1028', 'address3': '', - 'mainheatcont-description': 'Programmer, TRVs and bypass', 'sheating-energy-eff': 'N/A', - 'property-type': 'House', 'local-authority-label': 'Sheffield', 'fixed-lighting-outlets-count': '21', - 'energy-tariff': 'Single', 'mechanical-ventilation': 'natural', 'hot-water-cost-current': '96', - 'county': '', 'postcode': 'S6 4FG', 'solar-water-heating-flag': 'N', 'constituency': 'E14000921', - 'co2-emissions-potential': '2.9', 'number-heated-rooms': '5', - 'floor-description': 'Suspended, no insulation (assumed)', 'energy-consumption-potential': '152', - 'local-authority': 'E08000019', 'built-form': 'Enclosed Mid-Terrace', 'number-open-fireplaces': '0', - 'windows-description': 'Fully double glazed', 'glazed-area': 'Normal', 'inspection-date': '2022-06-13', - 'mains-gas-flag': 'Y', 'co2-emiss-curr-per-floor-area': '59', 'address1': '45 Shepperson Road', - 'heat-loss-corridor': '', 'flat-storey-count': '', - 'constituency-label': 'Sheffield, Brightside and Hillsborough', 'roof-energy-eff': 'Very Poor', - 'total-floor-area': '107.0', 'building-reference-number': '10002892085', 'environment-impact-current': '46', - 'co2-emissions-current': '6.3', 'roof-description': 'Pitched, no insulation (assumed)', - 'floor-energy-eff': 'N/A', 'number-habitable-rooms': '5', 'address2': '', 'hot-water-env-eff': 'Good', - 'posttown': 'SHEFFIELD', 'mainheatc-energy-eff': 'Average', 'main-fuel': 'mains gas (not community)', - 'lighting-env-eff': 'Average', 'windows-energy-eff': 'Average', 'floor-env-eff': 'N/A', - 'sheating-env-eff': 'N/A', 'lighting-description': 'Low energy lighting in 43% of fixed outlets', - 'roof-env-eff': 'Very Poor', 'walls-energy-eff': 'Very Poor', 'photo-supply': '0.0', - 'lighting-cost-potential': '83', 'mainheat-env-eff': 'Good', 'multi-glaze-proportion': '100', - 'main-heating-controls': '', 'lodgement-datetime': '2023-05-27 12:15:21', 'flat-top-storey': '', - 'current-energy-rating': 'E', 'secondheat-description': 'None', 'walls-env-eff': 'Very Poor', - 'transaction-type': 'marketed sale', 'uprn': '100051073214', 'current-energy-efficiency': '54', - 'energy-consumption-current': '335', 'mainheat-description': 'Boiler and radiators, mains gas', - 'lighting-cost-current': '131', 'lodgement-date': '2023-05-27', 'extension-count': '1', - 'mainheatc-env-eff': 'Average', - 'lmk-key': 'dc1a4da246562656132b8e36e0534cd90b09fa40fc584e25e644e2d9ab86a247', 'wind-turbine-count': '0', - 'tenure': 'Not defined - use in the case of a new dwelling for which the intended tenure in not known. It ' - 'is not to be used for an existing dwelling', - 'floor-level': '', 'potential-energy-efficiency': '80', 'hot-water-energy-eff': 'Good', - 'low-energy-lighting': '43', - 'walls-description': 'Sandstone or limestone, as built, no insulation (assumed)', - 'hotwater-description': 'From main system' - } - - row3 = { - 'UPRN': '100051073214', 'RDSAP_CHANGE': 2, 'HEAT_DEMAND_CHANGE': -22, 'CARBON_CHANGE': -0.39999999999999947, - 'SAP_STARTING': 54, 'SAP_ENDING': 56, 'HEAT_DEMAND_STARTING': 335, 'HEAT_DEMAND_ENDING': 313, - 'CARBON_STARTING': 6.3, 'CARBON_ENDING': 5.9, 'PROPERTY_TYPE': 'House', 'BUILT_FORM': 'Mid-Terrace', - 'CONSTITUENCY': 'E14000921', 'NUMBER_HABITABLE_ROOMS': 5.0, 'NUMBER_HEATED_ROOMS': 5.0, - 'FIXED_LIGHTING_OUTLETS_COUNT': 21.0, 'CONSTRUCTION_AGE_BAND': 'England and Wales: 1900-1929', - 'TRANSACTION_TYPE_STARTING': 'marketed sale', 'MECHANICAL_VENTILATION_STARTING': 'natural', - 'SECONDHEAT_DESCRIPTION_STARTING': 'None', 'ENERGY_TARIFF_STARTING': 'Single', - 'SOLAR_WATER_HEATING_FLAG_STARTING': 'N', 'PHOTO_SUPPLY_STARTING': 0.0, - 'GLAZED_TYPE_STARTING': 'double glazing, unknown install date', 'MULTI_GLAZE_PROPORTION_STARTING': 100.0, - 'LOW_ENERGY_LIGHTING_STARTING': 43.0, 'NUMBER_OPEN_FIREPLACES_STARTING': 0.0, - 'EXTENSION_COUNT_STARTING': 1.0, 'TOTAL_FLOOR_AREA_STARTING': 107.0, 'FLOOR_HEIGHT_STARTING': 1.87, - 'TRANSACTION_TYPE_ENDING': 'marketed sale', 'MECHANICAL_VENTILATION_ENDING': 'mechanical, extract only', - 'SECONDHEAT_DESCRIPTION_ENDING': 'None', 'ENERGY_TARIFF_ENDING': 'Single', - 'SOLAR_WATER_HEATING_FLAG_ENDING': 'N', 'PHOTO_SUPPLY_ENDING': 0.0, - 'GLAZED_TYPE_ENDING': 'double glazing, unknown install date', 'MULTI_GLAZE_PROPORTION_ENDING': 100.0, - 'LOW_ENERGY_LIGHTING_ENDING': 43.0, 'NUMBER_OPEN_FIREPLACES_ENDING': 0.0, 'EXTENSION_COUNT_ENDING': 1.0, - 'TOTAL_FLOOR_AREA_ENDING': 107.0, 'FLOOR_HEIGHT_ENDING': 1.87, 'DAYS_TO_STARTING': 3221, - 'DAYS_TO_ENDING': 2874, 'walls_thermal_transmittance': 2.0, 'is_cavity_wall': False, - 'is_filled_cavity': False, 'is_solid_brick': False, 'is_system_built': False, 'is_timber_frame': False, - 'is_granite_or_whinstone': False, 'is_as_built': True, 'is_cob': False, 'is_sandstone_or_limestone': True, - 'is_park_home': False, 'walls_insulation_thickness': 'none', 'external_insulation': False, - 'internal_insulation': False, 'walls_thermal_transmittance_ENDING': 2.0, 'is_park_home_ENDING': False, - 'walls_insulation_thickness_ENDING': 'none', 'external_insulation_ENDING': False, - 'internal_insulation_ENDING': False, 'floor_thermal_transmittance': 0.51, 'is_to_unheated_space': False, - 'is_to_external_air': False, 'is_suspended': True, 'is_solid': False, 'another_property_below': False, - 'floor_insulation_thickness': 'none', 'floor_thermal_transmittance_ENDING': 0.51, - 'floor_insulation_thickness_ENDING': 'none', 'roof_thermal_transmittance': 2.3, 'is_pitched': True, - 'is_roof_room': False, 'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, - 'has_dwelling_above': False, 'roof_insulation_thickness': 'none', 'roof_thermal_transmittance_ENDING': 2.3, - 'roof_insulation_thickness_ENDING': 'none', 'heater_type': 'Unknown', 'system_type': 'from main system', - 'thermostat_characteristics': 'Unknown', 'heating_scope': 'Unknown', 'energy_recovery': 'Unknown', - 'hotwater_tariff_type': 'Unknown', 'extra_features': 'Unknown', 'chp_systems': 'Unknown', - 'distribution_system': 'Unknown', 'no_system_present': 'Unknown', 'appliance': 'Unknown', - 'heater_type_ENDING': 'Unknown', 'system_type_ENDING': 'from main system', - 'thermostat_characteristics_ENDING': 'Unknown', 'heating_scope_ENDING': 'Unknown', - 'energy_recovery_ENDING': 'Unknown', 'hotwater_tariff_type_ENDING': 'Unknown', - 'extra_features_ENDING': 'Unknown', 'chp_systems_ENDING': 'Unknown', - 'distribution_system_ENDING': 'Unknown', 'no_system_present_ENDING': 'Unknown', - 'appliance_ENDING': 'Unknown', 'has_radiators': True, 'has_fan_coil_units': False, - 'has_pipes_in_screed_above_insulation': False, 'has_pipes_in_insulated_timber_floor': False, - 'has_pipes_in_concrete_slab': False, 'has_boiler': True, 'has_air_source_heat_pump': False, - 'has_room_heaters': False, 'has_electric_storage_heaters': False, 'has_warm_air': False, - 'has_electric_underfloor_heating': False, 'has_electric_ceiling_heating': False, - 'has_community_scheme': False, 'has_ground_source_heat_pump': False, 'has_no_system_present': False, - 'has_portable_electric_heaters': False, 'has_water_source_heat_pump': False, - 'has_electric_heat_pump': False, 'has_micro-cogeneration': False, 'has_solar_assisted_heat_pump': False, - 'has_exhaust_source_heat_pump': False, 'has_community_heat_pump': False, 'has_electric': False, - 'has_mains_gas': True, 'has_wood_logs': False, 'has_coal': False, 'has_oil': False, - 'has_wood_pellets': False, 'has_anthracite': False, 'has_dual_fuel_mineral_and_wood': False, - 'has_smokeless_fuel': False, 'has_lpg': False, 'has_b30k': False, 'has_electricaire': False, - 'has_assumed_for_most_rooms': False, 'has_underfloor_heating': False, 'has_radiators_ENDING': True, - 'has_fan_coil_units_ENDING': False, 'has_pipes_in_screed_above_insulation_ENDING': False, - 'has_pipes_in_insulated_timber_floor_ENDING': False, 'has_pipes_in_concrete_slab_ENDING': False, - 'has_boiler_ENDING': True, 'has_air_source_heat_pump_ENDING': False, 'has_room_heaters_ENDING': False, - 'has_electric_storage_heaters_ENDING': False, 'has_warm_air_ENDING': False, - 'has_electric_underfloor_heating_ENDING': False, 'has_electric_ceiling_heating_ENDING': False, - 'has_community_scheme_ENDING': False, 'has_ground_source_heat_pump_ENDING': False, - 'has_no_system_present_ENDING': False, 'has_portable_electric_heaters_ENDING': False, - 'has_water_source_heat_pump_ENDING': False, 'has_electric_heat_pump_ENDING': False, - 'has_micro-cogeneration_ENDING': False, 'has_solar_assisted_heat_pump_ENDING': False, - 'has_exhaust_source_heat_pump_ENDING': False, 'has_community_heat_pump_ENDING': False, - 'has_electric_ENDING': False, 'has_mains_gas_ENDING': True, 'has_wood_logs_ENDING': False, - 'has_coal_ENDING': False, 'has_oil_ENDING': False, 'has_wood_pellets_ENDING': False, - 'has_anthracite_ENDING': False, 'has_dual_fuel_mineral_and_wood_ENDING': False, - 'has_smokeless_fuel_ENDING': False, 'has_lpg_ENDING': False, 'has_b30k_ENDING': False, - 'has_electricaire_ENDING': False, 'has_assumed_for_most_rooms_ENDING': False, - 'has_underfloor_heating_ENDING': False, 'thermostatic_control': 'Unknown', 'charging_system': 'Unknown', - 'switch_system': 'programmer', 'no_control': 'Unknown', 'dhw_control': 'Unknown', - 'community_heating': 'Unknown', 'multiple_room_thermostats': False, 'auxiliary_systems': 'bypass', - 'trvs': 'trvs', 'rate_control': 'Unknown', 'thermostatic_control_ENDING': 'Unknown', - 'charging_system_ENDING': 'Unknown', 'switch_system_ENDING': 'programmer', 'no_control_ENDING': 'Unknown', - 'dhw_control_ENDING': 'Unknown', 'community_heating_ENDING': 'Unknown', - 'multiple_room_thermostats_ENDING': False, 'auxiliary_systems_ENDING': 'bypass', 'trvs_ENDING': 'trvs', - 'rate_control_ENDING': 'Unknown', 'glazing_type': 'double', 'glazing_type_ENDING': 'double', - 'fuel_type': 'mains gas', 'main-fuel_tariff_type': 'Unknown', 'is_community': False, - 'no_individual_heating_or_community_network': False, 'complex_fuel_type': 'Unknown', - 'fuel_type_ENDING': 'mains gas', 'main-fuel_tariff_type_ENDING': 'Unknown', 'is_community_ENDING': False, - 'no_individual_heating_or_community_network_ENDING': False, 'complex_fuel_type_ENDING': 'Unknown', - 'estimated_perimeter_STARTING': 30.06908711617298, 'estimated_perimeter_ENDING': 30.06908711617298, - 'HOT_WATER_ENERGY_EFF_STARTING': "Good", - "FLOOR_ENERGY_EFF_STARTING": "Unknown", - "WINDOWS_ENERGY_EFF_STARTING": "Average", - "WALLS_ENERGY_EFF_STARTING": "Very Poor", - "SHEATING_ENERGY_EFF_STARTING": "Unknown", - "ROOF_ENERGY_EFF_STARTING": "Very Poor", - "MAINHEAT_ENERGY_EFF_STARTING": "Good", - "MAINHEATC_ENERGY_EFF_STARTING": "Average", - "LIGHTING_ENERGY_EFF_STARTING": "Average", - "POTENTIAL_ENERGY_EFFICIENCY": 80, - "ENVIRONMENT_IMPACT_POTENTIAL": 75, - "ENERGY_CONSUMPTION_POTENTIAL": 152, - "CO2_EMISSIONS_POTENTIAL": 2.9, - 'HOT_WATER_ENERGY_EFF_ENDING': "Good", - "FLOOR_ENERGY_EFF_ENDING": "Unknown", - "WINDOWS_ENERGY_EFF_ENDING": "Average", - "WALLS_ENERGY_EFF_ENDING": "Very Poor", - "SHEATING_ENERGY_EFF_ENDING": "Unknown", - "ROOF_ENERGY_EFF_ENDING": "Very Poor", - "MAINHEAT_ENERGY_EFF_ENDING": "Good", - "MAINHEATC_ENERGY_EFF_ENDING": "Average", - "LIGHTING_ENERGY_EFF_ENDING": "Average", - } - - home3 = Property( - id=0, - postcode=starting_epc3["postcode"], - address=starting_epc3["address1"], - data=starting_epc3 - ) - home3.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds) - home3.set_number_lighting_outlets(None) - - data_processor3 = DataProcessor(None, newdata=True) - data_processor3.insert_data(pd.DataFrame([home3.get_model_data()])) - - data_processor3.pre_process() - - starting_epc_data3 = data_processor3.get_component_features(suffix="_STARTING") - ending_epc_data3 = data_processor3.get_component_features(suffix="_ENDING") - fixed_data3 = data_processor3.get_fixed_features() - - ending_lodgement_date3 = '2022-06-14' - - ending_epc_data3["DAYS_TO_ENDING"] = data_processor3.calculate_days_to(ending_lodgement_date3) - - recommendation3 = { - "recommendation_id": 0, - "type": "mechanical_ventilation" - } - - test_record3 = create_recommendation_scoring_data( - property=home3, - recommendation=recommendation3, - starting_epc_data=starting_epc_data3, - ending_epc_data=ending_epc_data3, - fixed_data=fixed_data3, - ) - test_record3 = pd.DataFrame([test_record3]) - - # Test the final cleaning: - test_record3 = DataProcessor.apply_averages_cleaning( - data_to_clean=test_record3, - cleaning_data=cleaning_data, - cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"] - ).drop(columns=["LOCAL_AUTHORITY"]) - - test_record3 = DataProcessor.clean_missings_after_description_process( - test_record3, [ - c for c in test_record3.columns if - ("thermal_transmittance" in c) or ("insulation_thickness" in c) - ] - ) - - for c in test_record3.columns: - if c in ["id", "SAP_ENDING", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]: - continue - - assert test_record3[c].values[0] == row3[c] - - def test_fireplaces(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds): - - starting_epc4 = { - 'low-energy-fixed-light-count': '', 'address': '9 Glebe Road, Asfordby Hill', - 'uprn-source': 'Energy Assessor', 'floor-height': '2.4', 'heating-cost-potential': '501', - 'unheated-corridor-length': '', 'hot-water-cost-potential': '70', - 'construction-age-band': 'England and Wales: 1930-1949', 'potential-energy-rating': 'C', - 'mainheat-energy-eff': 'Good', 'windows-env-eff': 'Average', 'lighting-energy-eff': 'Average', - 'environment-impact-potential': '76', 'glazed-type': 'double glazing, unknown install date', - 'heating-cost-current': '723', 'address3': '', - 'mainheatcont-description': 'Programmer and room thermostat', 'sheating-energy-eff': 'N/A', - 'property-type': 'House', 'local-authority-label': 'Melton', - 'fixed-lighting-outlets-count': '14', 'energy-tariff': 'dual', - 'mechanical-ventilation': 'natural', 'hot-water-cost-current': '98', - 'county': 'Leicestershire', 'postcode': 'LE14 3QT', 'solar-water-heating-flag': 'N', - 'constituency': 'E14000909', 'co2-emissions-potential': '2.4', 'number-heated-rooms': '5', - 'floor-description': 'Solid, no insulation (assumed)', 'energy-consumption-potential': '153', - 'local-authority': 'E07000133', 'built-form': 'Semi-Detached', 'number-open-fireplaces': '1', - 'windows-description': 'Fully double glazed', 'glazed-area': 'Normal', - 'inspection-date': '2022-06-27', 'mains-gas-flag': 'Y', 'co2-emiss-curr-per-floor-area': '46', - 'address1': '9 Glebe Road', 'heat-loss-corridor': '', 'flat-storey-count': '', - 'constituency-label': 'Rutland and Melton', 'roof-energy-eff': 'Good', - 'total-floor-area': '87.0', 'building-reference-number': '10002396876', - 'environment-impact-current': '60', 'co2-emissions-current': '4.0', - 'roof-description': 'Pitched, 200 mm loft insulation', 'floor-energy-eff': 'N/A', - 'number-habitable-rooms': '5', 'address2': 'Asfordby Hill', 'hot-water-env-eff': 'Good', - 'posttown': 'MELTON MOWBRAY', 'mainheatc-energy-eff': 'Average', - 'main-fuel': 'mains gas (not community)', 'lighting-env-eff': 'Average', - 'windows-energy-eff': 'Average', 'floor-env-eff': 'N/A', 'sheating-env-eff': 'N/A', - 'lighting-description': 'Low energy lighting in 29% of fixed outlets', 'roof-env-eff': 'Good', - 'walls-energy-eff': 'Very Poor', 'photo-supply': '15.0', 'lighting-cost-potential': '79', - 'mainheat-env-eff': 'Good', 'multi-glaze-proportion': '100', 'main-heating-controls': '', - 'lodgement-datetime': '2022-06-27 15:28:18', 'flat-top-storey': '', - 'current-energy-rating': 'D', - 'secondheat-description': 'Room heaters, dual fuel (mineral and wood)', - 'walls-env-eff': 'Very Poor', 'transaction-type': 'ECO assessment', 'uprn': '100030539619', - 'current-energy-efficiency': '66', 'energy-consumption-current': '256', - 'mainheat-description': 'Boiler and radiators, mains gas', 'lighting-cost-current': '135', - 'lodgement-date': '2022-06-27', 'extension-count': '1', 'mainheatc-env-eff': 'Average', - 'lmk-key': '736b6f4803a11d9e45b49bf98f36eb8a7f357b0dd24f3e7cddef5295518e5bef', - 'wind-turbine-count': '0', 'tenure': 'Owner-occupied', 'floor-level': '', - 'potential-energy-efficiency': '78', 'hot-water-energy-eff': 'Good', - 'low-energy-lighting': '29', - 'walls-description': 'Solid brick, as built, no insulation (assumed)', - 'hotwater-description': 'From main system' - } - - row4 = { - 'UPRN': '100030539619', 'RDSAP_CHANGE': 7, 'HEAT_DEMAND_CHANGE': -41, 'CARBON_CHANGE': -0.5, - 'SAP_STARTING': 66, 'SAP_ENDING': 73, 'HEAT_DEMAND_STARTING': 256, 'HEAT_DEMAND_ENDING': 215, - 'CARBON_STARTING': 4.0, 'CARBON_ENDING': 3.5, 'PROPERTY_TYPE': 'House', 'BUILT_FORM': 'Semi-Detached', - 'CONSTITUENCY': 'E14000909', 'NUMBER_HABITABLE_ROOMS': 5.0, 'NUMBER_HEATED_ROOMS': 5.0, - 'FIXED_LIGHTING_OUTLETS_COUNT': 14.0, 'CONSTRUCTION_AGE_BAND': 'England and Wales: 1930-1949', - 'TRANSACTION_TYPE_STARTING': 'eco assessment', 'MECHANICAL_VENTILATION_STARTING': 'natural', - 'SECONDHEAT_DESCRIPTION_STARTING': 'Room heaters, dual fuel (mineral and wood)', - 'ENERGY_TARIFF_STARTING': 'dual', 'SOLAR_WATER_HEATING_FLAG_STARTING': 'N', 'PHOTO_SUPPLY_STARTING': 15.0, - 'GLAZED_TYPE_STARTING': 'double glazing, unknown install date', 'MULTI_GLAZE_PROPORTION_STARTING': 100.0, - 'LOW_ENERGY_LIGHTING_STARTING': 29.0, 'NUMBER_OPEN_FIREPLACES_STARTING': 1.0, - 'EXTENSION_COUNT_STARTING': 1.0, 'TOTAL_FLOOR_AREA_STARTING': 87.0, 'FLOOR_HEIGHT_STARTING': 2.4, - 'TRANSACTION_TYPE_ENDING': 'eco assessment', 'MECHANICAL_VENTILATION_ENDING': 'natural', - 'SECONDHEAT_DESCRIPTION_ENDING': 'Room heaters, dual fuel (mineral and wood)', - 'ENERGY_TARIFF_ENDING': 'dual', 'SOLAR_WATER_HEATING_FLAG_ENDING': 'N', 'PHOTO_SUPPLY_ENDING': 15.0, - 'GLAZED_TYPE_ENDING': 'double glazing, unknown install date', 'MULTI_GLAZE_PROPORTION_ENDING': 100.0, - 'LOW_ENERGY_LIGHTING_ENDING': 29.0, 'NUMBER_OPEN_FIREPLACES_ENDING': 0, 'EXTENSION_COUNT_ENDING': 1.0, - 'TOTAL_FLOOR_AREA_ENDING': 87.0, 'FLOOR_HEIGHT_ENDING': 2.4, 'DAYS_TO_STARTING': 2887, - 'DAYS_TO_ENDING': 2960, 'walls_thermal_transmittance': 1.7, 'is_cavity_wall': False, - 'is_filled_cavity': False, 'is_solid_brick': True, 'is_system_built': False, 'is_timber_frame': False, - 'is_granite_or_whinstone': False, 'is_as_built': True, 'is_cob': False, 'is_sandstone_or_limestone': False, - 'is_park_home': False, 'walls_insulation_thickness': 'none', 'external_insulation': False, - 'internal_insulation': False, 'walls_thermal_transmittance_ENDING': 1.7, 'is_park_home_ENDING': False, - 'walls_insulation_thickness_ENDING': 'none', 'external_insulation_ENDING': False, - 'internal_insulation_ENDING': False, 'floor_thermal_transmittance': 0.53, 'is_to_unheated_space': False, - 'is_to_external_air': False, 'is_suspended': False, 'is_solid': True, 'another_property_below': False, - 'floor_insulation_thickness': 'none', 'floor_thermal_transmittance_ENDING': 0.53, - 'floor_insulation_thickness_ENDING': 'none', 'roof_thermal_transmittance': 0.21, 'is_pitched': True, - 'is_roof_room': False, 'is_loft': True, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, - 'has_dwelling_above': False, 'roof_insulation_thickness': '200', 'roof_thermal_transmittance_ENDING': 0.21, - 'roof_insulation_thickness_ENDING': '200', 'heater_type': 'Unknown', 'system_type': 'from main system', - 'thermostat_characteristics': 'Unknown', 'heating_scope': 'Unknown', 'energy_recovery': 'Unknown', - 'hotwater_tariff_type': 'Unknown', 'extra_features': 'Unknown', 'chp_systems': 'Unknown', - 'distribution_system': 'Unknown', 'no_system_present': 'Unknown', 'appliance': 'Unknown', - 'heater_type_ENDING': 'Unknown', 'system_type_ENDING': 'from main system', - 'thermostat_characteristics_ENDING': 'Unknown', 'heating_scope_ENDING': 'Unknown', - 'energy_recovery_ENDING': 'Unknown', 'hotwater_tariff_type_ENDING': 'Unknown', - 'extra_features_ENDING': 'Unknown', 'chp_systems_ENDING': 'Unknown', - 'distribution_system_ENDING': 'Unknown', 'no_system_present_ENDING': 'Unknown', - 'appliance_ENDING': 'Unknown', 'has_radiators': True, 'has_fan_coil_units': False, - 'has_pipes_in_screed_above_insulation': False, 'has_pipes_in_insulated_timber_floor': False, - 'has_pipes_in_concrete_slab': False, 'has_boiler': True, 'has_air_source_heat_pump': False, - 'has_room_heaters': False, 'has_electric_storage_heaters': False, 'has_warm_air': False, - 'has_electric_underfloor_heating': False, 'has_electric_ceiling_heating': False, - 'has_community_scheme': False, 'has_ground_source_heat_pump': False, 'has_no_system_present': False, - 'has_portable_electric_heaters': False, 'has_water_source_heat_pump': False, - 'has_electric_heat_pump': False, 'has_micro-cogeneration': False, 'has_solar_assisted_heat_pump': False, - 'has_exhaust_source_heat_pump': False, 'has_community_heat_pump': False, 'has_electric': False, - 'has_mains_gas': True, 'has_wood_logs': False, 'has_coal': False, 'has_oil': False, - 'has_wood_pellets': False, 'has_anthracite': False, 'has_dual_fuel_mineral_and_wood': False, - 'has_smokeless_fuel': False, 'has_lpg': False, 'has_b30k': False, 'has_electricaire': False, - 'has_assumed_for_most_rooms': False, 'has_underfloor_heating': False, 'has_radiators_ENDING': True, - 'has_fan_coil_units_ENDING': False, 'has_pipes_in_screed_above_insulation_ENDING': False, - 'has_pipes_in_insulated_timber_floor_ENDING': False, 'has_pipes_in_concrete_slab_ENDING': False, - 'has_boiler_ENDING': True, 'has_air_source_heat_pump_ENDING': False, 'has_room_heaters_ENDING': False, - 'has_electric_storage_heaters_ENDING': False, 'has_warm_air_ENDING': False, - 'has_electric_underfloor_heating_ENDING': False, 'has_electric_ceiling_heating_ENDING': False, - 'has_community_scheme_ENDING': False, 'has_ground_source_heat_pump_ENDING': False, - 'has_no_system_present_ENDING': False, 'has_portable_electric_heaters_ENDING': False, - 'has_water_source_heat_pump_ENDING': False, 'has_electric_heat_pump_ENDING': False, - 'has_micro-cogeneration_ENDING': False, 'has_solar_assisted_heat_pump_ENDING': False, - 'has_exhaust_source_heat_pump_ENDING': False, 'has_community_heat_pump_ENDING': False, - 'has_electric_ENDING': False, 'has_mains_gas_ENDING': True, 'has_wood_logs_ENDING': False, - 'has_coal_ENDING': False, 'has_oil_ENDING': False, 'has_wood_pellets_ENDING': False, - 'has_anthracite_ENDING': False, 'has_dual_fuel_mineral_and_wood_ENDING': False, - 'has_smokeless_fuel_ENDING': False, 'has_lpg_ENDING': False, 'has_b30k_ENDING': False, - 'has_electricaire_ENDING': False, 'has_assumed_for_most_rooms_ENDING': False, - 'has_underfloor_heating_ENDING': False, 'thermostatic_control': 'room thermostat', - 'charging_system': 'Unknown', 'switch_system': 'programmer', 'no_control': 'Unknown', - 'dhw_control': 'Unknown', 'community_heating': 'Unknown', 'multiple_room_thermostats': False, - 'auxiliary_systems': 'Unknown', 'trvs': 'Unknown', 'rate_control': 'Unknown', - 'thermostatic_control_ENDING': 'room thermostat', 'charging_system_ENDING': 'Unknown', - 'switch_system_ENDING': 'programmer', 'no_control_ENDING': 'Unknown', 'dhw_control_ENDING': 'Unknown', - 'community_heating_ENDING': 'Unknown', 'multiple_room_thermostats_ENDING': False, - 'auxiliary_systems_ENDING': 'Unknown', 'trvs_ENDING': 'Unknown', 'rate_control_ENDING': 'Unknown', - 'glazing_type': 'double', 'glazing_type_ENDING': 'double', 'fuel_type': 'mains gas', - 'main-fuel_tariff_type': 'Unknown', 'is_community': False, - 'no_individual_heating_or_community_network': False, 'complex_fuel_type': 'Unknown', - 'fuel_type_ENDING': 'mains gas', 'main-fuel_tariff_type_ENDING': 'Unknown', 'is_community_ENDING': False, - 'no_individual_heating_or_community_network_ENDING': False, 'complex_fuel_type_ENDING': 'Unknown', - 'estimated_perimeter_STARTING': 27.113649698998472, 'estimated_perimeter_ENDING': 27.113649698998472, - 'HOT_WATER_ENERGY_EFF_STARTING': "Good", - "FLOOR_ENERGY_EFF_STARTING": "Unknown", - "WINDOWS_ENERGY_EFF_STARTING": "Average", - "WALLS_ENERGY_EFF_STARTING": "Very Poor", - "SHEATING_ENERGY_EFF_STARTING": "Unknown", - "ROOF_ENERGY_EFF_STARTING": "Good", - "MAINHEAT_ENERGY_EFF_STARTING": "Good", - "MAINHEATC_ENERGY_EFF_STARTING": "Average", - "LIGHTING_ENERGY_EFF_STARTING": "Average", - "POTENTIAL_ENERGY_EFFICIENCY": 78, - "ENVIRONMENT_IMPACT_POTENTIAL": 76, - "ENERGY_CONSUMPTION_POTENTIAL": 153, - "CO2_EMISSIONS_POTENTIAL": 2.4, - 'HOT_WATER_ENERGY_EFF_ENDING': "Good", - "FLOOR_ENERGY_EFF_ENDING": "Unknown", - "WINDOWS_ENERGY_EFF_ENDING": "Average", - "WALLS_ENERGY_EFF_ENDING": "Very Poor", - "SHEATING_ENERGY_EFF_ENDING": "Unknown", - "ROOF_ENERGY_EFF_ENDING": "Good", - "MAINHEAT_ENERGY_EFF_ENDING": "Good", - "MAINHEATC_ENERGY_EFF_ENDING": "Average", - "LIGHTING_ENERGY_EFF_ENDING": "Average", - } - - home4 = Property( - id=0, - postcode=starting_epc4["postcode"], - address=starting_epc4["address1"], - data=starting_epc4 - ) - home4.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds) - home4.set_number_lighting_outlets(None) - - data_processor4 = DataProcessor(None, newdata=True) - data_processor4.insert_data(pd.DataFrame([home4.get_model_data()])) - - data_processor4.pre_process() - - starting_epc_data4 = data_processor4.get_component_features(suffix="_STARTING") - ending_epc_data4 = data_processor4.get_component_features(suffix="_ENDING") - fixed_data4 = data_processor4.get_fixed_features() - - ending_lodgement_date4 = '2022-09-08' - - ending_epc_data4["DAYS_TO_ENDING"] = data_processor4.calculate_days_to(ending_lodgement_date4) - - recommendation4 = { - "recommendation_id": 0, - "type": "sealing_open_fireplace" - } - - test_record4 = create_recommendation_scoring_data( - property=home4, - recommendation=recommendation4, - starting_epc_data=starting_epc_data4, - ending_epc_data=ending_epc_data4, - fixed_data=fixed_data4, - ) - test_record4 = pd.DataFrame([test_record4]) - - # Test the final cleaning: - test_record4 = DataProcessor.apply_averages_cleaning( - data_to_clean=test_record4, - cleaning_data=cleaning_data, - cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"] - ).drop(columns=["LOCAL_AUTHORITY"]) - - test_record4 = DataProcessor.clean_missings_after_description_process( - test_record4, [ - c for c in test_record4.columns if - ("thermal_transmittance" in c) or ("insulation_thickness" in c) - ] - ) - - for c in test_record4.columns: - if c in ["id", "SAP_ENDING", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]: - continue - - assert test_record4[c].values[0] == row4[c] diff --git a/etl/epc/tests/test_epcrecord.py b/etl/epc/tests/test_epcrecord.py index 48ad5148..cf0361b1 100644 --- a/etl/epc/tests/test_epcrecord.py +++ b/etl/epc/tests/test_epcrecord.py @@ -109,14 +109,13 @@ class TestEpcRecord: assert record.prepared_epc["energy-consumption-current"] == 200.0 assert record.prepared_epc["co2-emissions-current"] == 5.5 - def test_clean_energy_empty_values(self, cleaning_data, epc_records_1): + def test_clean_energy_empty_values(self, cleaning_data): # We cannot have invalid values so this should raise an exception record = EPCRecord(cleaning_data=cleaning_data) record.prepared_epc = { "energy-consumption-current": "", "co2-emissions-current": "" } - record._clean_energy() with pytest.raises(ValueError): record._clean_energy() @@ -207,7 +206,7 @@ class TestEpcRecord: "mains-gas-flag": "InvalidValue" } # It should always be Y or N or an anomally value - with pytest.raises(ValueError): + with pytest.raises(KeyError): record._clean_mains_gas() record = EPCRecord(cleaning_data=cleaning_data) @@ -225,7 +224,8 @@ class TestEpcRecord: } record._clean_solar_hot_water() - assert record.prepared_epc["solar-water-heating-flag"] is True + assert record.prepared_epc["solar-water-heating-flag"] == "Y" + assert record.solar_water_heating_flag_bool is True def test_clean_solar_hot_water_empty(self, cleaning_data): record = EPCRecord(cleaning_data=cleaning_data) @@ -234,7 +234,8 @@ class TestEpcRecord: } record._clean_solar_hot_water() - assert record.prepared_epc["solar-water-heating-flag"] is None + assert record.prepared_epc["solar-water-heating-flag"] == "N" + assert record.solar_water_heating_flag_bool is False def test_clean_number_lighting_outlets_valid(self, cleaning_data, epc_records_1): record = EPCRecord(cleaning_data=cleaning_data, epc_records=epc_records_1) @@ -320,7 +321,8 @@ class TestEpcRecord: record._clean_solar_hot_water() - assert record.prepared_epc["solar-water-heating-flag"] is True + assert record.prepared_epc["solar-water-heating-flag"] == "Y" + assert record.solar_water_heating_flag_bool is True record = EPCRecord(cleaning_data=cleaning_data) @@ -330,7 +332,8 @@ class TestEpcRecord: record._clean_solar_hot_water() - assert record.prepared_epc["solar-water-heating-flag"] is False + assert record.prepared_epc["solar-water-heating-flag"] == "N" + assert record.solar_water_heating_flag_bool is False record = EPCRecord(cleaning_data=cleaning_data) @@ -340,7 +343,8 @@ class TestEpcRecord: record._clean_solar_hot_water() - assert record.prepared_epc["solar-water-heating-flag"] is None + assert record.prepared_epc["solar-water-heating-flag"] == "N" + assert record.solar_water_heating_flag_bool is False record = EPCRecord(cleaning_data=cleaning_data) @@ -350,4 +354,5 @@ class TestEpcRecord: record._clean_solar_hot_water() - assert record.prepared_epc["solar-water-heating-flag"] is None + assert record.prepared_epc["solar-water-heating-flag"] == "N" + assert record.solar_water_heating_flag_bool is False From 4608ac89a5dd00ec04dca170a964499abd663691 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 17:42:03 +0000 Subject: [PATCH 12/48] fixed roof tests --- backend/Property.py | 22 ++++---- .../tests/test_fireplace_recommendations.py | 24 ++++---- .../tests/test_lighting_recommendations.py | 18 +++--- .../tests/test_roof_recommendations.py | 55 +++++++++++-------- 4 files changed, 68 insertions(+), 51 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index ee496552..e6ae8bbe 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -68,7 +68,7 @@ class Property(Definitions): self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None self.restricted_measures = False self.year_built = epc_record.get("year_built") - self.number_of_rooms = epc_record.prepared_epc["number_heated_rooms"] + self.number_of_rooms = epc_record.prepared_epc.get("number_heated_rooms") self.age_band = epc_record.get("age_band") self.construction_age_band = epc_record.get("construction_age_band") self.number_of_floors = epc_record.get("number_of_floors") @@ -81,7 +81,7 @@ class Property(Definitions): "co2_emissions": epc_record.get("co2_emissions_current"), } self.ventilation = { - "ventilation": epc_record.prepared_epc["mechanical_ventilation"], + "ventilation": epc_record.prepared_epc.get("mechanical_ventilation"), } self.solar_pv = { "solar_pv": epc_record.get("photo_supply"), @@ -91,29 +91,29 @@ class Property(Definitions): "solar_hot_water_boolean": epc_record.get("solar_water_heating_flag_bool"), } self.wind_turbine = { - "wind_turbine": epc_record.prepared_epc["wind_turbine_count"], + "wind_turbine": epc_record.prepared_epc.get("wind_turbine_count"), } self.number_of_open_fireplaces = { - "number_of_open_fireplaces": epc_record.prepared_epc["number_open_fireplaces"], + "number_of_open_fireplaces": epc_record.prepared_epc.get("number_open_fireplaces"), } self.number_of_extensions = { - "number_of_extensions": epc_record.prepared_epc["extension_count"], + "number_of_extensions": epc_record.prepared_epc.get("extension_count"), } self.number_of_storeys = { - "number_of_storeys": epc_record.prepared_epc["flat_storey_count"], + "number_of_storeys": epc_record.prepared_epc.get("flat_storey_count"), } self.heat_loss_corridor = { - "heat_loss_corridor": epc_record.prepared_epc["heat_loss_corridor"], - "length": epc_record.prepared_epc["unheated_corridor_length"], + "heat_loss_corridor": epc_record.prepared_epc.get("heat_loss_corridor"), + "length": epc_record.prepared_epc.get("unheated_corridor_length"), "heat_loss_corridor_boolean": epc_record.get("heat_loss_corridor_bool"), } - self.mains_gas = epc_record.prepared_epc['mains_gas_flag'] - self.floor_height = epc_record.prepared_epc['floor_height'] + self.mains_gas = epc_record.prepared_epc.get('mains_gas_flag') + self.floor_height = epc_record.prepared_epc.get('floor_height') self.insulation_wall_area = None self.floor_area = epc_record.prepared_epc.get('total_floor_area') self.pitched_roof_area = None self.insulation_floor_area = None - self.number_lighting_outlets = epc_record.prepared_epc["fixed_lighting_outlets_count"] + self.number_lighting_outlets = epc_record.prepared_epc.get("fixed_lighting_outlets_count") self.floor_level = None self.number_of_windows = None self.solar_pv_roof_area = None diff --git a/recommendations/tests/test_fireplace_recommendations.py b/recommendations/tests/test_fireplace_recommendations.py index a91d6697..f21d6bc3 100644 --- a/recommendations/tests/test_fireplace_recommendations.py +++ b/recommendations/tests/test_fireplace_recommendations.py @@ -1,16 +1,18 @@ from backend.Property import Property -from unittest.mock import Mock from recommendations.FireplaceRecommendations import FireplaceRecommendations +from etl.epc.Record import EPCRecord class TestFirepaceRecommendations: def test_no_fireplaces(self): - property_instance = Property(id=0, address="fake", postcode="fake") - property_instance.data = { - "number-open-fireplaces": 0 + epc_record = EPCRecord() + epc_record.prepared_epc = { + "number-open-fireplaces": 0, } + property_instance = Property(id=0, address="fake", postcode="fake", epc_record=epc_record) + recommender = FireplaceRecommendations( property_instance=property_instance ) @@ -22,10 +24,11 @@ class TestFirepaceRecommendations: assert recommender.recommendation is None def test_one_fireplace(self): - property_instance = Property(id=0, address="fake", postcode="fake") - property_instance.data = { - "number-open-fireplaces": 1 + epc_record = EPCRecord() + epc_record.prepared_epc = { + "number-open-fireplaces": 1, } + property_instance = Property(id=0, address="fake", postcode="fake", epc_record=epc_record) recommender = FireplaceRecommendations( property_instance=property_instance @@ -40,10 +43,11 @@ class TestFirepaceRecommendations: assert recommender.recommendation[0]["total"] == 300 def test_multiple_fireplaces(self): - property_instance = Property(id=0, address="fake", postcode="fake") - property_instance.data = { - "number-open-fireplaces": 3 + epc_record = EPCRecord() + epc_record.prepared_epc = { + "number-open-fireplaces": 3, } + property_instance = Property(id=0, address="fake", postcode="fake", epc_record=epc_record) recommender = FireplaceRecommendations( property_instance=property_instance diff --git a/recommendations/tests/test_lighting_recommendations.py b/recommendations/tests/test_lighting_recommendations.py index 964f1da0..45213d70 100644 --- a/recommendations/tests/test_lighting_recommendations.py +++ b/recommendations/tests/test_lighting_recommendations.py @@ -1,5 +1,5 @@ import pytest -from unittest.mock import Mock +from etl.epc.Record import EPCRecord from backend.Property import Property from recommendations.LightingRecommendations import LightingRecommendations @@ -9,18 +9,20 @@ from recommendations.tests.test_data.materials import materials class TestLightingRecommendations: def test_init_invalid_materials(self): - input_property0 = Property(id=1, postcode="F4k3 6", address="623 fake street") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Greater London Authority"} + input_property0 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record) input_property0.lighting = {"low_energy_proportion": 0} - input_property0.data = {"county": "Greater London Authority"} # Test for invalid materials with pytest.raises(ValueError): LightingRecommendations(input_property0, []) def test_recommend_no_action_needed(self): # Case where no recommendation is needed - input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Greater London Authority"} + input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record) input_property1.lighting = {"low_energy_proportion": 100} - input_property1.data = {"county": "Greater London Authority"} lr = LightingRecommendations(input_property1, materials) lr.recommend() @@ -28,9 +30,9 @@ class TestLightingRecommendations: def test_recommend_action_needed(self): # Case where recommendation is needed - input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street") - input_property1.lighting = {"low_energy_proportion": 100} - input_property1.data = {"county": "Greater London Authority"} + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Greater London Authority"} + input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record) input_property1.lighting = {"low_energy_proportion": 0.80} input_property1.number_lighting_outlets = 20 diff --git a/recommendations/tests/test_roof_recommendations.py b/recommendations/tests/test_roof_recommendations.py index 75b7ddb2..3d555a4f 100644 --- a/recommendations/tests/test_roof_recommendations.py +++ b/recommendations/tests/test_roof_recommendations.py @@ -1,12 +1,17 @@ from backend.Property import Property from recommendations.RoofRecommendations import RoofRecommendations from recommendations.tests.test_data.materials import materials +from etl.epc.Record import EPCRecord class TestRoofRecommendations: def test_loft_insulation_recommendation_no_insulation(self): - property_instance = Property(id=0, address="fake", postcode="fake") + epc_record = EPCRecord() + epc_record.prepared_epc = { + "county": "Cambridgeshire", + } + property_instance = Property(id=0, address="fake", postcode="fake", epc_record=epc_record) property_instance.age_band = "F" property_instance.insulation_floor_area = 100 property_instance.roof = { @@ -18,9 +23,6 @@ class TestRoofRecommendations: 'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'none', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none' } - property_instance.data = { - "county": "Cambridgeshire", - } roof_recommender = RoofRecommendations(property_instance=property_instance, materials=materials) @@ -31,7 +33,9 @@ class TestRoofRecommendations: assert len(roof_recommender.recommendations) def test_loft_insulation_recommendation_50mm_insulation(self): - property_instance2 = Property(id=0, address="fake", postcode="fake") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Kent"} + property_instance2 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record) property_instance2.age_band = "F" property_instance2.insulation_floor_area = 100 property_instance2.roof = { @@ -43,7 +47,6 @@ class TestRoofRecommendations: 'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': '50', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none' } - property_instance2.data = {"county": "Kent"} roof_recommender2 = RoofRecommendations(property_instance=property_instance2, materials=materials) @@ -57,7 +60,9 @@ class TestRoofRecommendations: assert roof_recommender2.recommendations[0]["new_u_value"] == 0.14 assert roof_recommender2.recommendations[0]["starting_u_value"] == 0.68 - property_instance3 = Property(id=0, address="fake", postcode="fake") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Greater London Authority"} + property_instance3 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record) property_instance3.age_band = "F" property_instance3.insulation_floor_area = 100 property_instance3.roof = { @@ -69,7 +74,6 @@ class TestRoofRecommendations: 'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': '50', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none' } - property_instance3.data = {"county": "Greater London Authority"} roof_recommender3 = RoofRecommendations(property_instance=property_instance3, materials=materials) @@ -82,7 +86,9 @@ class TestRoofRecommendations: assert roof_recommender3.recommendations[0]["parts"][0]["depth"] == 270 def test_loft_insulation_recommendation_150mm_insulation(self): - property_instance4 = Property(id=0, address="fake", postcode="fake") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "North East Lincolnshire"} + property_instance4 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record) property_instance4.age_band = "F" property_instance4.insulation_floor_area = 100 property_instance4.roof = { @@ -94,7 +100,6 @@ class TestRoofRecommendations: 'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': '150', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none' } - property_instance4.data = {"county": "North East Lincolnshire"} roof_recommender4 = RoofRecommendations(property_instance=property_instance4, materials=materials) @@ -109,7 +114,9 @@ class TestRoofRecommendations: assert roof_recommender4.recommendations[0]["starting_u_value"] == 0.3 assert roof_recommender4.recommendations[0]["parts"][0]["depth"] == 150 - property_instance5 = Property(id=0, address="fake", postcode="fake") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Somerset"} + property_instance5 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record) property_instance5.age_band = "F" property_instance5.insulation_floor_area = 100 property_instance5.roof = { @@ -121,7 +128,6 @@ class TestRoofRecommendations: 'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': '150', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none' } - property_instance5.data = {"county": "Somerset"} roof_recommender5 = RoofRecommendations(property_instance=property_instance5, materials=materials) @@ -136,7 +142,9 @@ class TestRoofRecommendations: def test_loft_insulation_recommendation_270mm_insulation(self): # We shouldn't recommend anything in this case - property_instance6 = Property(id=0, address="fake", postcode="fake") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Portsmouth"} + property_instance6 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record) property_instance6.age_band = "F" property_instance6.insulation_floor_area = 100 property_instance6.roof = { @@ -148,7 +156,6 @@ class TestRoofRecommendations: 'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': '270', 'roof_thermal_transmittance': None, 'roof_insulation_thickness': 'none' } - property_instance6.data = {"county": "Portsmouth"} roof_recommender6 = RoofRecommendations(property_instance=property_instance6, materials=materials) @@ -277,7 +284,9 @@ class TestRoofRecommendations: # "Insulate your room roof with 270mm of Example room roof insulation" def test_flat_no_insulation(self): - property_instance11 = Property(id=11, address="fake", postcode="fake") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Swindon"} + property_instance11 = Property(id=11, address="fake", postcode="fake", epc_record=epc_record) property_instance11.age_band = "D" property_instance11.insulation_floor_area = 33.5 property_instance11.perimeter = 24 @@ -288,7 +297,6 @@ class TestRoofRecommendations: 'is_roof_room': False, 'is_loft': False, 'is_flat': True, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'none' } - property_instance11.data = {"county": "Swindon"} roof_recommender11 = RoofRecommendations(property_instance=property_instance11, materials=materials) @@ -306,7 +314,9 @@ class TestRoofRecommendations: "Insulate the home's flat roof with 150mm of Ecotherm Eco-Versal General Purpose Insulation Board" def test_flat_insulated(self): - property_instance12 = Property(id=12, address="fake", postcode="fake") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Thurrock"} + property_instance12 = Property(id=12, address="fake", postcode="fake", epc_record=epc_record) property_instance12.age_band = "D" property_instance12.insulation_floor_area = 40 property_instance12.perimeter = 30 @@ -319,7 +329,6 @@ class TestRoofRecommendations: 'is_loft': False, 'is_flat': True, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'average' } - property_instance12.data = {"county": "Thurrock"} roof_recommender12 = RoofRecommendations(property_instance=property_instance12, materials=materials) @@ -330,7 +339,9 @@ class TestRoofRecommendations: assert not roof_recommender12.recommendations def test_flat_limited_insulation(self): - property_instance13 = Property(id=12, address="fake", postcode="fake") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Tyne and Wear"} + property_instance13 = Property(id=12, address="fake", postcode="fake", epc_record=epc_record) property_instance13.age_band = "D" property_instance13.insulation_floor_area = 40 property_instance13.perimeter = 40 @@ -342,7 +353,6 @@ class TestRoofRecommendations: 'is_loft': False, 'is_flat': True, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'below average' } - property_instance13.data = {"county": "Tyne and Wear"} roof_recommender13 = RoofRecommendations(property_instance=property_instance13, materials=materials) @@ -362,7 +372,9 @@ class TestRoofRecommendations: "Insulate the home's flat roof with 150mm of Ecotherm Eco-Versal General Purpose Insulation Board" def test_property_above(self): - property_instance14 = Property(id=0, address="fake", postcode="fake") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Suffolk"} + property_instance14 = Property(id=0, address="fake", postcode="fake", epc_record=epc_record) property_instance14.age_band = "F" property_instance14.insulation_floor_area = 100 property_instance14.roof = { @@ -373,7 +385,6 @@ class TestRoofRecommendations: 'is_assumed': False, 'has_dwelling_above': True, 'is_valid': True, 'insulation_thickness': None } - property_instance14.data = {"county": "Suffolk"} roof_recommender14 = RoofRecommendations(property_instance=property_instance14, materials=materials) From 74c36b5456602bde4698603f7bbe3de8c160df6d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 17:44:46 +0000 Subject: [PATCH 13/48] fixed solar tests --- .../tests/test_solar_pv_recommendations.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/recommendations/tests/test_solar_pv_recommendations.py b/recommendations/tests/test_solar_pv_recommendations.py index f2436cb1..5481cb17 100644 --- a/recommendations/tests/test_solar_pv_recommendations.py +++ b/recommendations/tests/test_solar_pv_recommendations.py @@ -1,45 +1,50 @@ import pytest from recommendations.SolarPvRecommendations import SolarPvRecommendations from backend.Property import Property +from etl.epc.Record import EPCRecord class TestSolarPvRecommendations: @pytest.fixture def property_instance_invalid_type(self): # Setup the property_instance with an invalid property type - property_instance_invalid_type = Property(id=1, address="", postcode="") - property_instance_invalid_type.data = { + epc_record = EPCRecord() + epc_record.prepared_epc = { "property-type": "InvalidType", "county": "Broxbourne", "photo-supply": None } + property_instance_invalid_type = Property(id=1, address="", postcode="", epc_record=epc_record) property_instance_invalid_type.roof = {"is_flat": False, "is_pitched": False, "is_roof_room": False} return property_instance_invalid_type @pytest.fixture def property_instance_invalid_roof(self): # Setup the property_instance with invalid roof type - property_instance_invalid_roof = Property(id=1, address="", postcode="") - property_instance_invalid_roof.data = { + epc_record = EPCRecord() + epc_record.prepared_epc = { "county": "Huntingdonshire", "property-type": "House", "photo-supply": None } + property_instance_invalid_roof = Property(id=1, address="", postcode="", epc_record=epc_record) property_instance_invalid_roof.roof = {"is_flat": False, "is_pitched": False, "is_roof_room": False} return property_instance_invalid_roof @pytest.fixture def property_instance_has_solar_pv(self): # Setup the property_instance without existing solar pv - property_instance_has_solar_pv = Property(id=1, address="", postcode="") - property_instance_has_solar_pv.data = {"photo-supply": "40", "county": "Huntingdonshire", - "property-type": "House"} + epc_record = EPCRecord() + epc_record.prepared_epc = {"photo-supply": "40", "county": "Huntingdonshire", + "property-type": "House"} + property_instance_has_solar_pv = Property(id=1, address="", postcode="", epc_record=epc_record) property_instance_has_solar_pv.roof = {"is_flat": True} return property_instance_has_solar_pv @pytest.fixture def property_instance_valid_all(self): # Setup a valid property_instance that passes all conditions - property_instance_valid_all = Property(id=1, address="", postcode="") + epc_record = EPCRecord() + epc_record.prepared_epc = {"property-type": "House", "photo-supply": None, "county": "Huntingdonshire"} + property_instance_valid_all = Property(id=1, address="", postcode="", epc_record=epc_record) property_instance_valid_all.solar_pv_roof_area = 20 property_instance_valid_all.solar_pv_percentage = 40 - property_instance_valid_all.data = {"property-type": "House", "photo-supply": None, "county": "Huntingdonshire"} property_instance_valid_all.roof = {"is_flat": True} return property_instance_valid_all From 40976fd395f06a63ff07a030347d5a8ce218b891 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 17:46:44 +0000 Subject: [PATCH 14/48] fixed ventialtion recs --- .../tests/test_ventilation_recommendations.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/recommendations/tests/test_ventilation_recommendations.py b/recommendations/tests/test_ventilation_recommendations.py index 3242b1d1..aa992253 100644 --- a/recommendations/tests/test_ventilation_recommendations.py +++ b/recommendations/tests/test_ventilation_recommendations.py @@ -1,13 +1,15 @@ from backend.Property import Property from recommendations.VentilationRecommendations import VentilationRecommendations from recommendations.tests.test_data.materials import materials +from etl.epc.Record import EPCRecord class TestVentilationRecommendations: def test_natural_ventilation(self): - input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street") - input_property1.data = {"mechanical-ventilation": "natural"} + epc_record = EPCRecord() + epc_record.prepared_epc = {"mechanical-ventilation": "natural"} + input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record) recommender = VentilationRecommendations( property_instance=input_property1, @@ -27,8 +29,9 @@ class TestVentilationRecommendations: assert recommender.recommendation[0]["parts"][0]["quantity"] == 2 def test_missing_ventilation(self): - input_property2 = Property(id=1, postcode="F4k3 6", address="623 fake street") - input_property2.data = {"mechanical-ventilation": None} + epc_record = EPCRecord() + epc_record.prepared_epc = {"mechanical-ventilation": None} + input_property2 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record) recommender2 = VentilationRecommendations( property_instance=input_property2, @@ -48,8 +51,9 @@ class TestVentilationRecommendations: assert recommender2.recommendation[0]["parts"][0]["quantity"] == 2 def test_nodata_ventilation(self): - input_property3 = Property(id=1, postcode="F4k3 6", address="623 fake street") - input_property3.data = {"mechanical-ventilation": "NO DATA!!"} + epc_record = EPCRecord() + epc_record.prepared_epc = {"mechanical-ventilation": "NO DATA!!"} + input_property3 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record) recommender3 = VentilationRecommendations( property_instance=input_property3, @@ -69,8 +73,9 @@ class TestVentilationRecommendations: assert recommender3.recommendation[0]["parts"][0]["quantity"] == 2 def test_existing_ventilation_1(self): - input_property4 = Property(id=1, postcode="F4k3 6", address="623 fake street") - input_property4.data = {"mechanical-ventilation": 'mechanical, extract only'} + epc_record = EPCRecord() + epc_record.prepared_epc = {"mechanical-ventilation": "mechanical, extract only"} + input_property4 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record) recommender4 = VentilationRecommendations( property_instance=input_property4, @@ -85,8 +90,9 @@ class TestVentilationRecommendations: assert recommender4.has_ventilaion def test_existing_ventilation_2(self): - input_property5 = Property(id=1, postcode="F4k3 6", address="623 fake street") - input_property5.data = {"mechanical-ventilation": 'mechanical, supply and extract'} + epc_record = EPCRecord() + epc_record.prepared_epc = {"mechanical-ventilation": "mechanical, supply and extract"} + input_property5 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record) recommender5 = VentilationRecommendations( property_instance=input_property5, From bbb4892437f5e41f23ac38213e39c7f0bb3f55b6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 18:28:26 +0000 Subject: [PATCH 15/48] fixed recommendation unit tests --- .../tests/test_wall_recommendations.py | 36 ++++--- .../tests/test_window_recommendations.py | 102 ++++++++++-------- 2 files changed, 80 insertions(+), 58 deletions(-) diff --git a/recommendations/tests/test_wall_recommendations.py b/recommendations/tests/test_wall_recommendations.py index bfc681f5..580ebb91 100644 --- a/recommendations/tests/test_wall_recommendations.py +++ b/recommendations/tests/test_wall_recommendations.py @@ -7,6 +7,7 @@ from recommendations.WallRecommendations import WallRecommendations from backend.Property import Property from recommendations.recommendation_utils import is_diminishing_returns from recommendations.tests.test_data.materials import materials +from etl.epc.Record import EPCRecord # with open( @@ -231,7 +232,9 @@ class TestWallRecommendationsBase: class TestCavityWallRecommensations: def test_fill_empty_cavity(self): - input_property = Property(id=1, postcode="F4k3", address="123 fake street") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "Derbyshire"} + input_property = Property(id=1, postcode="F4k3", address="123 fake street", epc_record=epc_record) input_property.walls = { 'original_description': 'Cavity wall, as built, no insulation (assumed)', 'clean_description': 'Cavity wall, as built, no insulation', @@ -245,7 +248,6 @@ class TestCavityWallRecommensations: } input_property.age_band = "C" input_property.insulation_wall_area = 50 - input_property.data = {"county": "Derbyshire"} recommender = WallRecommendations( property_instance=input_property, @@ -265,7 +267,9 @@ class TestCavityWallRecommensations: assert np.isclose(recommender.recommendations[1]["total"], 2004.6600000000003) def test_fill_partial_filled_cavity(self): - input_property = Property(id=1, postcode="F4k3", address="123 fake street") + epc_record = EPCRecord() + epc_record.prepared_epc = {"county": "County Durham"} + input_property = Property(id=1, postcode="F4k3", address="123 fake street", epc_record=epc_record) input_property.walls = { 'original_description': 'Cavity wall, as built, partial insulation (assumed)', 'clean_description': 'Cavity wall, as built, partial insulation', @@ -279,7 +283,6 @@ class TestCavityWallRecommensations: } input_property.age_band = "C" input_property.insulation_wall_area = 50 - input_property.data = {"county": "County Durham"} recommender = WallRecommendations( property_instance=input_property, @@ -299,7 +302,9 @@ class TestCavityWallRecommensations: assert np.isclose(recommender.recommendations[1]["total"], 1999.9350000000002) def test_system_built_wall(self): - input_property2 = Property(id=1, postcode="F4k3 2", address="223 fake street") + epc_record = EPCRecord() + epc_record.prepared_epc = {"property-type": "House", "county": "Derbyshire", "built-form": "Detached"} + input_property2 = Property(id=1, postcode="F4k3 2", address="223 fake street", epc_record=epc_record) input_property2.walls = { 'original_description': 'System built, as built, no insulation (assumed)', 'clean_description': 'System built, as built, no insulation', @@ -314,7 +319,6 @@ class TestCavityWallRecommensations: input_property2.age_band = "F" input_property2.insulation_wall_area = 120 input_property2.restricted_measures = False - input_property2.data = {"property-type": "House", "county": "Derbyshire", "built-form": "Detached"} assert input_property2.walls["is_system_built"] @@ -346,7 +350,9 @@ class TestCavityWallRecommensations: assert recommender2.recommendations[6]["parts"][0]["depth"] == 52.5 def test_timber_frame_wall(self): - input_property3 = Property(id=1, postcode="F4k3 2", address="223 fake street") + epc_record = EPCRecord() + epc_record.prepared_epc = {"property-type": "House", "county": "Derbyshire", "built-form": "Semi-Detached"} + input_property3 = Property(id=1, postcode="F4k3 2", address="223 fake street", epc_record=epc_record) input_property3.walls = { 'original_description': 'Timber frame, as built, no insulation (assumed)', 'clean_description': 'Timber frame, as built, no insulation', @@ -361,7 +367,6 @@ class TestCavityWallRecommensations: input_property3.age_band = "B" input_property3.insulation_wall_area = 99 input_property3.restricted_measures = False - input_property3.data = {"property-type": "House", "county": "Derbyshire", "built-form": "Semi-Detached"} assert input_property3.walls["is_timber_frame"] @@ -388,7 +393,9 @@ class TestCavityWallRecommensations: assert recommender3.recommendations[1]["parts"][0]["depth"] == 150.0 def test_granite_or_whinstone_wall(self): - input_property4 = Property(id=1, postcode="F4k3 2", address="223 fake street") + epc_record = EPCRecord() + epc_record.prepared_epc = {"property-type": "Bungalow", "county": "Derbyshire", "built-form": "Detached"} + input_property4 = Property(id=1, postcode="F4k3 2", address="223 fake street", epc_record=epc_record) input_property4.walls = { 'original_description': 'Granite or whinstone, as built, no insulation (assumed)', 'clean_description': 'Granite or whinstone, as built, no insulation', @@ -403,7 +410,6 @@ class TestCavityWallRecommensations: input_property4.age_band = "A" input_property4.insulation_wall_area = 223 input_property4.restricted_measures = False - input_property4.data = {"property-type": "Bungalow", "county": "Derbyshire", "built-form": "Detached"} assert input_property4.walls["is_granite_or_whinstone"] @@ -430,7 +436,9 @@ class TestCavityWallRecommensations: assert recommender4.recommendations[1]["parts"][0]["depth"] == 150 def test_cob_wall(self): - input_property5 = Property(id=1, postcode="F4k3 2", address="223 fake street") + epc_record = EPCRecord() + epc_record.prepared_epc = {"property-type": "Bungalow", "county": "Derbyshire", "built-form": "Detached"} + input_property5 = Property(id=1, postcode="F4k3 2", address="223 fake street", epc_record=epc_record) input_property5.walls = { 'original_description': 'Cob, as built', 'clean_description': 'Cob, as built', @@ -445,7 +453,6 @@ class TestCavityWallRecommensations: input_property5.age_band = "E" input_property5.insulation_wall_area = 77 input_property5.restricted_measures = False - input_property5.data = {"property-type": "Bungalow", "county": "Derbyshire", "built-form": "Detached"} assert input_property5.walls["is_cob"] @@ -472,7 +479,9 @@ class TestCavityWallRecommensations: assert recommender5.recommendations[3]["parts"][0]["depth"] == 100 def test_sandstone_or_limestone_wall(self): - input_property6 = Property(id=1, postcode="F4k3 6", address="623 fake street") + epc_record = EPCRecord() + epc_record.prepared_epc = {"property-type": "House", "county": "Derbyshire", "built-form": "Mid-Terrace"} + input_property6 = Property(id=1, postcode="F4k3 6", address="623 fake street", epc_record=epc_record) input_property6.walls = { 'original_description': 'Sandstone or limestone, as built, no insulation (assumed)', 'clean_description': 'Sandstone or limestone, as built, no insulation', @@ -487,7 +496,6 @@ class TestCavityWallRecommensations: input_property6.age_band = "F" input_property6.insulation_wall_area = 350 input_property6.restricted_measures = False - input_property6.data = {"property-type": "House", "county": "Derbyshire", "built-form": "Mid-Terrace"} assert input_property6.walls["is_sandstone_or_limestone"] diff --git a/recommendations/tests/test_window_recommendations.py b/recommendations/tests/test_window_recommendations.py index 664a1e39..36e70834 100644 --- a/recommendations/tests/test_window_recommendations.py +++ b/recommendations/tests/test_window_recommendations.py @@ -1,6 +1,7 @@ from recommendations.WindowsRecommendations import WindowsRecommendations from backend.Property import Property from recommendations.tests.test_data.materials import materials +from etl.epc.Record import EPCRecord class TestWindowRecommendations: @@ -10,16 +11,17 @@ class TestWindowRecommendations: For this property, we expect all windows to be single glazed and should recommend full double glazing :return: """ - + epc_record = EPCRecord() + epc_record.prepared_epc = { + "county": "Wychavon", + "multi-glaze-proportion": 0, + "uprn": 0 + } property_1 = Property( id=1, postcode='1', address='1', - data={ - "county": "Wychavon", - "multi-glaze-proportion": 0, - "uprn": 0 - } + epc_record=epc_record ) property_1.windows = { 'original_description': 'Single glazed', 'has_glazing': False, 'glazing_coverage': 'full', @@ -47,16 +49,17 @@ class TestWindowRecommendations: double glazing :return: """ - + epc_record = EPCRecord() + epc_record.prepared_epc = { + "county": "Wychavon", + "multi-glaze-proportion": 33, + "uprn": 0 + } property_2 = Property( id=1, postcode='1', address='1', - data={ - "county": "Wychavon", - "multi-glaze-proportion": 33, - "uprn": 0 - } + epc_record=epc_record ) property_2.windows = {'original_description': 'Mostly double glazing', 'has_glazing': True, 'glazing_coverage': 'most', @@ -81,16 +84,17 @@ class TestWindowRecommendations: This property has full double glazing so we shouldn't recommend anything :return: """ - + epc_record = EPCRecord() + epc_record.prepared_epc = { + "county": "Wychavon", + "multi-glaze-proportion": 100, + "uprn": 0 + } property_3 = Property( id=1, postcode='1', address='1', - data={ - "county": "Wychavon", - "multi-glaze-proportion": 80, - "uprn": 0 - } + epc_record=epc_record ) property_3.windows = {'original_description': 'Fully double glazed', 'has_glazing': True, 'glazing_coverage': 'full', @@ -106,15 +110,17 @@ class TestWindowRecommendations: assert not recommender3.recommendation def test_fully_secondary_glazed(self): + epc_record = EPCRecord() + epc_record.prepared_epc = { + "county": "Wychavon", + "multi-glaze-proportion": 100, + "uprn": 0 + } property_4 = Property( id=1, postcode='1', address='1', - data={ - "county": "Wychavon", - "multi-glaze-proportion": 100, - "uprn": 0 - } + epc_record=epc_record ) property_4.windows = {'original_description': 'Full secondary glazing', 'has_glazing': True, 'glazing_coverage': 'full', @@ -130,15 +136,17 @@ class TestWindowRecommendations: assert not recommender4.recommendation def test_partial_secondary_glazing(self): + epc_record = EPCRecord() + epc_record.prepared_epc = { + "county": "Wychavon", + "multi-glaze-proportion": 50, + "uprn": 0 + } property_5 = Property( id=1, postcode='1', address='1', - data={ - "county": "Wychavon", - "multi-glaze-proportion": 50, - "uprn": 0 - } + epc_record=epc_record ) property_5.windows = {'original_description': 'Partial secondary glazing', 'has_glazing': True, 'glazing_coverage': 'partial', @@ -160,15 +168,18 @@ class TestWindowRecommendations: 'labour_days': 0.8125, 'is_secondary_glazing': True}] def test_single_glazed_restricted_measures(self): + epc_record = EPCRecord() + epc_record.prepared_epc = { + "county": "Wychavon", + "multi-glaze-proportion": 0, + "uprn": 0 + } + property_6 = Property( id=1, postcode='1', address='1', - data={ - "county": "Wychavon", - "multi-glaze-proportion": 0, - "uprn": 0 - } + epc_record=epc_record ) property_6.windows = {'original_description': 'Single glazed', 'has_glazing': False, 'glazing_coverage': None, 'glazing_type': 'single', @@ -195,15 +206,17 @@ class TestWindowRecommendations: ] def test_full_triple_glazed(self): + epc_record = EPCRecord() + epc_record.prepared_epc = { + "county": "Wychavon", + "multi-glaze-proportion": 100, + "uprn": 0 + } property_7 = Property( id=1, postcode='1', address='1', - data={ - "county": "Wychavon", - "multi-glaze-proportion": 100, - "uprn": 0 - } + epc_record=epc_record ) property_7.windows = {'original_description': 'Fully triple glazed', 'has_glazing': True, 'glazing_coverage': 'full', @@ -222,16 +235,17 @@ class TestWindowRecommendations: """ We should just recommend double glazing to the remaining windows, since it's a cheaper option """ - + epc_record = EPCRecord() + epc_record.prepared_epc = { + "county": "Wychavon", + "multi-glaze-proportion": 80, + "uprn": 1 + } property_8 = Property( id=1, postcode='1', address='1', - data={ - "county": "Wychavon", - "multi-glaze-proportion": 80, - "uprn": 1 - } + epc_record=epc_record ) property_8.windows = {'original_description': 'Mostly triple glazing', 'has_glazing': True, 'glazing_coverage': 'most', From 4adfa0bb6228278b1c3162b551ac8960f60cf48b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 19 Jan 2024 19:44:38 +0000 Subject: [PATCH 16/48] fixed all tests --- backend/tests/test_property.py | 53 ++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/backend/tests/test_property.py b/backend/tests/test_property.py index 09594a40..43149791 100644 --- a/backend/tests/test_property.py +++ b/backend/tests/test_property.py @@ -1,9 +1,9 @@ import pandas as pd import pytest from unittest.mock import Mock -from epc_api.client import EpcClient from backend.Property import Property from etl.epc_clean.EpcClean import EpcClean +from etl.epc.Record import EPCRecord # Define some test data mock_epc_response = { @@ -196,12 +196,21 @@ class TestProperty: @pytest.fixture(autouse=True) def property_instance(self, mock_cleaner): - property_instance = Property(id=1, postcode="AB12CD", address="Test Address", data=mock_epc_response["rows"][0]) + epc_record = EPCRecord() + epc_record.prepared_epc = mock_epc_response["rows"][0] + + property_instance = Property(id=1, postcode="AB12CD", address="Test Address", epc_record=epc_record) + property_instance.number_of_floors = 2 + property_instance.number_of_rooms = 5 + property_instance.floor_area = 100 + property_instance.floor_height = 2.5 return property_instance @pytest.fixture(autouse=True) def property_instance_dupe_data(self): - property_instance_dupe_data = Property(id=2, postcode="AB12CD", address="Test Address") + epc_record = EPCRecord() + epc_record.prepared_epc = mock_epc_response_dupe["rows"][0] + property_instance_dupe_data = Property(id=2, postcode="AB12CD", address="Test Address", epc_record=epc_record) return property_instance_dupe_data # @pytest.fixture @@ -271,15 +280,17 @@ class TestProperty: return mock_cleaner def test_init(self): - inst1 = Property(0, postcode="AB12CD", address="Test Address") + epc_record = EPCRecord() + epc_record.prepared_epc = {"uprn": 1} + inst1 = Property(0, postcode="AB12CD", address="Test Address", epc_record=epc_record) - assert inst1.data is None + assert inst1.data is not None - inst2 = Property(3, "AB12CD", "Test Address") + inst2 = Property(3, "AB12CD", "Test Address", epc_record=epc_record) assert inst2.id == 3 - inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data", "uprn": 123}) - assert inst3.data == {"some": "data", "uprn": 123} + inst3 = Property(4, "AB12CD", "Test Address", epc_record=epc_record) + assert inst3.data == {"uprn": 1} def test_get_components( self, property_instance, mock_cleaner, mock_photo_supply_lookup, mock_floor_area_decile_thresholds @@ -372,7 +383,9 @@ class TestProperty: property_instance.get_components(cleaned, mock_photo_supply_lookup, mock_floor_area_decile_thresholds) def test_set_spatial(self): - prop = Property(1, postcode="AB12CD", address="Test Address") + epc_record = EPCRecord() + epc_record.prepared_epc = mock_epc_response["rows"][0] + prop = Property(1, postcode="AB12CD", address="Test Address", epc_record=epc_record) spatial1 = pd.DataFrame([{ 'X_COORDINATE': 411143.0, 'Y_COORDINATE': 281701.0, 'LATITUDE': 52.4331896, 'LONGITUDE': -1.8375238, @@ -386,7 +399,7 @@ class TestProperty: assert prop.is_heritage assert prop.restricted_measures - prop2 = Property(1, "AB12CD", "Test Address") + prop2 = Property(1, "AB12CD", "Test Address", epc_record=epc_record) spatial2 = pd.DataFrame([{ 'X_COORDINATE': 411143.0, 'Y_COORDINATE': 281701.0, 'LATITUDE': 52.4331896, 'LONGITUDE': -1.8375238, @@ -403,8 +416,9 @@ class TestProperty: def test_set_floor_level(self): # In this case, we have a flat which looks looks it's on the first floor, but it's actually on the ground # floor, so we should set floor_level to 0 - prop = Property(1, postcode="AB12CD", address="Test Address") - prop.data = {'floor-level': '01', 'property-type': 'Flat'} + epc_record = EPCRecord() + epc_record.prepared_epc = {'floor-level': '01', 'property-type': 'Flat'} + prop = Property(1, postcode="AB12CD", address="Test Address", epc_record=epc_record) prop.floor = { 'original_description': 'Solid, no insulation (assumed)', 'clean_description': 'Solid, no insulation', 'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_assumed': True, @@ -419,8 +433,9 @@ class TestProperty: # This property is labelled as being on the ground floor but actually has another property below # so we set floor level to 1 - prop2 = Property(1, postcode="AB12CD", address="Test Address") - prop2.data = {'floor-level': 'Ground', 'property-type': 'Flat'} + epc_record = EPCRecord() + epc_record.prepared_epc = {'floor-level': 'Ground', 'property-type': 'Flat'} + prop2 = Property(1, postcode="AB12CD", address="Test Address", epc_record=epc_record) prop2.floor = { 'original_description': '(Another dwelling below)', 'clean_description': 'Solid, no insulation', 'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_assumed': False, @@ -434,8 +449,9 @@ class TestProperty: assert prop2.floor_level == 1 # this property is correctly labelled as being on the 2nd floor - prop3 = Property(1, postcode="AB12CD", address="Test Address") - prop3.data = {'floor-level': '02', 'property-type': 'Flat'} + epc_record = EPCRecord() + epc_record.prepared_epc = {'floor-level': '02', 'property-type': 'Flat'} + prop3 = Property(1, postcode="AB12CD", address="Test Address", epc_record=epc_record) prop3.floor = { 'original_description': '(Another dwelling below)', 'clean_description': 'Solid, no insulation', 'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_assumed': False, @@ -449,8 +465,9 @@ class TestProperty: assert prop3.floor_level == 2 # Example of a house - prop4 = Property(1, postcode="AB12CD", address="Test Address") - prop4.data = {'floor-level': '', 'property-type': 'House'} + epc_record = EPCRecord() + epc_record.prepared_epc = {'floor-level': '', 'property-type': 'House'} + prop4 = Property(1, postcode="AB12CD", address="Test Address", epc_record=epc_record) prop4.floor = { 'original_description': '(Another dwelling below)', 'clean_description': 'Solid, no insulation', 'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_assumed': False, From 01a4628d206be30ed88c195fa9b7b04909a53637 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 11:03:45 +0000 Subject: [PATCH 17/48] read in asset list for ha 1, working on ha 6 --- etl/eligibility/ha_15_32/app.py | 2 - .../ha_15_32/ha_analysis_batch_3.py | 182 ++++++++++++++++++ 2 files changed, 182 insertions(+), 2 deletions(-) create mode 100644 etl/eligibility/ha_15_32/ha_analysis_batch_3.py diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py index 76aadcc4..ce216364 100644 --- a/etl/eligibility/ha_15_32/app.py +++ b/etl/eligibility/ha_15_32/app.py @@ -16,8 +16,6 @@ from dotenv import load_dotenv from backend.SearchEpc import SearchEpc from backend.Property import Property from etl.eligibility.Eligibility import Eligibility -from etl.epc.DataProcessor import DataProcessor -from backend.app.plan.utils import create_recommendation_scoring_data from etl.epc.settings import COLUMNS_TO_MERGE_ON from backend.ml_models.api import ModelApi diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py new file mode 100644 index 00000000..7c28d481 --- /dev/null +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -0,0 +1,182 @@ +import os +import msgpack +import openpyxl +from pathlib import Path +from tqdm import tqdm +from datetime import datetime +import pandas as pd +import numpy as np +from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet +from utils.logger import setup_logger +from dotenv import load_dotenv +from tqdm import tqdm +from backend.SearchEpc import SearchEpc +from etl.eligibility.Eligibility import Eligibility +from etl.eligibility.ha_15_32.app import prepare_model_data_row +from etl.epc.settings import COLUMNS_TO_MERGE_ON +from backend.ml_models.api import ModelApi +from etl.solar.SolarPhotoSupply import SolarPhotoSupply +from recommendations.recommendation_utils import calculate_cavity_age +from recommendation_utils import convert_thickness_to_numeric + +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env" + +logger = setup_logger() +load_dotenv(ENV_FILE) + + +class DataLoader: + COLOUR_CONFIG = { + "ha_1": { + "asset_list": {"red": "FFFF0000", "green": "FF00B050"}, + }, + "ha_6": { + "asset_list": {"red": "FFFF0000", "green": "FF00B050"}, + }, + } + + def __init__(self, files): + self.files = files + + def load_asset_list(self, file_path, ha_name, sheet_name=None): + workbook = openpyxl.load_workbook(file_path) + if sheet_name is not None: + sheet = workbook[sheet_name] + else: + sheet = workbook.active + sheet_colnames = [cell.value for cell in sheet[1]] + + rows_data = [] + rows_colors = [] + for row in sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values + row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None + # row_color = COLOR_INDEX[row_color] + rows_data.append(row_data) + rows_colors.append(row_color) + + asset_list = pd.DataFrame(rows_data, columns=sheet_colnames) + asset_list['row_color'] = rows_colors + + asset_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"] + + asset_list["row_colour_name"] = np.where( + asset_list["row_color"] == asset_list_colours["red"], "red", + np.where(asset_list["row_color"] == asset_list_colours["green"], "green", "yellow") + ) + + asset_list["row_meaning"] = np.where( + asset_list["row_colour_name"] == "red", "does not meet criteria", + np.where( + asset_list["row_colour_name"] == "green", "identified potential eco works (CWI)", "maybe in the future" + ) + ) + + return asset_list + + def load_survey_list(self, file_path, ha_name, sheet_name=None): + survey_workbook = openpyxl.load_workbook(file_path) + if sheet_name is not None: + survey_sheet = survey_workbook[sheet_name] + else: + survey_sheet = survey_workbook.active + + survey_rows = [] + survey_colors = [] + + for row in tqdm(survey_sheet.iter_rows(min_row=2, values_only=False)): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values + row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None + survey_rows.append(row_data) + survey_colors.append(row_color) + + survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]]) + + survey_list["row_colour"] = survey_colors + survey_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"] + + # The survey list has 4 possible colours: + # PURPLE - Installer advised install complete and a complimentary post works EPC has been completed. + # GREEN - Installer advised install complete. + # RED - Cancelled + # NO FILL - No official update from installer (could be installed or cancelled) + + survey_list["row_colour_name"] = np.where( + survey_list["row_colour"] == survey_list_colours["red"], "red", + np.where(survey_list["row_colour"] == survey_list_colours["green"], "green", + np.where(survey_list["row_colour"] == survey_list_colours["purple"], "purple", "yellow")) + ) + + survey_list["row_meaning"] = np.where( + survey_list["row_colour_name"] == "red", "Cancelled", + np.where( + survey_list["row_colour_name"] == "green", + "Installer advised install complete", + np.where( + survey_list["row_colour_name"] == "purple", + "Installer advised install complete and a complimentary post works EPC has been completed", + "No official update from installer (could be installed or cancelled)" + ) + ) + ) + + return survey_list + + def load(self): + + data = {} + for ha_name, file_config in self.files.items(): + # Load asset list + # logger.info("LOading asset list for {}".format(ha_name)) + asset_list = self.load_asset_list( + file_path=file_config["asset_list"]["filepath"], + ha_name=ha_name, + sheet_name=file_config["asset_list"]["sheetname"] + ) + + if file_config.get("survey_list"): + survey_list = self.load_survey_list( + file_path=file_config["survey_list"]["filepath"], + ha_name=ha_name, + sheet_name=file_config["survey_list"]["sheetname"] + ) + else: + survey_list = None + + data[ha_name] = { + "asset_list": asset_list, + "survey_list": survey_list + } + + +def app(): + """ + This app contains the housign association analysis for HAs 1, 6, 14, 39 and 107. + Only HA 6 has surveys + :return: + """ + + files = { + "ha_1": { + "asset_list": { + "filepath": "etl/eligibility/ha_15_32/HA 1 - ASSET LIST.xlsx", + "sheetname": "HA 1" + } + }, + "ha_6": { + "asset_list": { + "filepath": "etl/eligibility/ha_15_32/HA 6 - ASSET LIST.xlsx", + "sheetname": "HA 6" + }, + "survey_list": { + "filepath": "etl/eligibility/ha_15_32/HA 6 - SURVEY LIST.xlsx", + "sheetname": "HA 6" + } + }, + "ha_14": {"asset_list": "etl/eligibility/ha_15_32/HA 14 - ASSET LIST.xlsx"}, + "ha_39": {"asset_list": "etl/eligibility/ha_15_32/HA 39 - ASSET LIST.xlsx"}, + "ha_107": {"asset_list": "etl/eligibility/ha_15_32/HA 107 - ASSET LIST.xlsx"} + } + + loader = DataLoader(files) From b22003d2066b4de6b9d3c1aba9091cc5bf98b09b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 11:12:06 +0000 Subject: [PATCH 18/48] Read in survey list for HA 6 --- .../ha_15_32/ha_analysis_batch_3.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7c28d481..9a95cd21 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -33,6 +33,9 @@ class DataLoader: }, "ha_6": { "asset_list": {"red": "FFFF0000", "green": "FF00B050"}, + "survey_list": { + "green": "FF92D050", "purple": "FF7030A0", "red": "FFFF0000", "blue": "FF00B0F0" + } }, } @@ -57,6 +60,7 @@ class DataLoader: rows_colors.append(row_color) asset_list = pd.DataFrame(rows_data, columns=sheet_colnames) + asset_list = asset_list.loc[:, asset_list.columns.notnull()] asset_list['row_color'] = rows_colors asset_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"] @@ -92,20 +96,24 @@ class DataLoader: survey_colors.append(row_color) survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]]) + # Remove columns that are None + survey_list = survey_list.loc[:, survey_list.columns.notnull()] survey_list["row_colour"] = survey_colors - survey_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"] + survey_list_colours = self.COLOUR_CONFIG[ha_name]["survey_list"] # The survey list has 4 possible colours: # PURPLE - Installer advised install complete and a complimentary post works EPC has been completed. # GREEN - Installer advised install complete. # RED - Cancelled + # BLUE - Loft Only Installed # NO FILL - No official update from installer (could be installed or cancelled) survey_list["row_colour_name"] = np.where( survey_list["row_colour"] == survey_list_colours["red"], "red", np.where(survey_list["row_colour"] == survey_list_colours["green"], "green", - np.where(survey_list["row_colour"] == survey_list_colours["purple"], "purple", "yellow")) + np.where(survey_list["row_colour"] == survey_list_colours["purple"], "purple", + np.where(survey_list["row_colour"] == survey_list_colours["blue"], "blue", "no fill"))) ) survey_list["row_meaning"] = np.where( @@ -116,7 +124,11 @@ class DataLoader: np.where( survey_list["row_colour_name"] == "purple", "Installer advised install complete and a complimentary post works EPC has been completed", - "No official update from installer (could be installed or cancelled)" + np.where( + survey_list["row_colour_name"] == "blue", + "Loft Only Installed", + "No official update from installer (could be installed or cancelled)" + ) ) ) ) From f1670498d1fcc55473f63ca76287fe3309a648d7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 11:15:56 +0000 Subject: [PATCH 19/48] Setting up to merge HA6 --- .../ha_15_32/ha_analysis_batch_3.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 9a95cd21..bd2c6c99 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -77,9 +77,12 @@ class DataLoader: ) ) + # Add in asset_list_row_id + asset_list["asset_list_row_id"] = [ha_name + str(i) for i in range(0, len(asset_list))] + return asset_list - def load_survey_list(self, file_path, ha_name, sheet_name=None): + def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None): survey_workbook = openpyxl.load_workbook(file_path) if sheet_name is not None: survey_sheet = survey_workbook[sheet_name] @@ -133,8 +136,22 @@ class DataLoader: ) ) + # Add in asset_list_row_id + survey_list["survey_list_row_id"] = [ha_name + str(i) for i in range(0, len(survey_list))] + + # We now do the matching between the asset list and the survey list. + # What we'll get from this is a lookup table from the asset list to the survey list + + if ha_name == "ha_6": + self.merge_ha_6(asset_list, survey_list) + else: + raise NotImplementedError("Only HA 6 has surveys") + return survey_list + def merge_ha_6(self, asset_list, survey_list): + pass + def load(self): data = {} From cf9253d06201bbadca263eef269de973957b9556 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 11:41:52 +0000 Subject: [PATCH 20/48] working on matching code for HA6 asset and survey lists --- .../ha_15_32/ha_analysis_batch_3.py | 49 +++++++++++++++++-- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index bd2c6c99..7fbddd54 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -52,7 +52,7 @@ class DataLoader: rows_data = [] rows_colors = [] - for row in sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + for row in tqdm(sheet.iter_rows(min_row=2, values_only=False)): # Assuming the first row is headers row_data = [cell.value for cell in row] # This will get you the cell values row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None # row_color = COLOR_INDEX[row_color] @@ -137,7 +137,7 @@ class DataLoader: ) # Add in asset_list_row_id - survey_list["survey_list_row_id"] = [ha_name + str(i) for i in range(0, len(survey_list))] + survey_list["survey_list_row_id"] = [ha_name + "_surveys_" + str(i) for i in range(0, len(survey_list))] # We now do the matching between the asset list and the survey list. # What we'll get from this is a lookup table from the asset list to the survey list @@ -150,14 +150,53 @@ class DataLoader: return survey_list def merge_ha_6(self, asset_list, survey_list): - pass + + # Prepare the asset list + asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().strip() + asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().strip() + + split_addresses = asset_list['matching_address'].str.split(',', expand=True) + split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5'] + house_numbers = split_addresses['temp'].str.split(' ', expand=True) + house_numbers.columns = ['HouseNo', 'part1', 'part2', "part3", "part4", "part5"] + + asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1) + del split_addresses, house_numbers + + matching_lookup = [] + for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): + house_number = row["NO."] + if isinstance(house_number, str): + house_number = house_number.lower().strip() + + # Filter on the first line of the address + df = asset_list[ + asset_list["matching_address"].str.contains(row["Street / Block Name"].lower().strip()) + ].copy() + df = df[df["matching_address"].str.contains(str(house_number))] + if df.shape[0] != 1: + df = df[df["HouseNo"] == str(house_number)] + if df.shape[0] != 1: + df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower())] + if df.shape[0] != 1: + print(row["Street / Block Name"]) + print(house_number) + print(row["Post Code"].lower()) + raise ValueError("Investigate") + + matching_lookup.append( + { + "survey_list_row_id": row["survey_list_row_id"], + "asset_list_row_id": df["asset_list_row_id"].values[0], + } + ) def load(self): data = {} for ha_name, file_config in self.files.items(): # Load asset list - # logger.info("LOading asset list for {}".format(ha_name)) + logger.info("Loading asset list for {}".format(ha_name)) asset_list = self.load_asset_list( file_path=file_config["asset_list"]["filepath"], ha_name=ha_name, @@ -165,6 +204,7 @@ class DataLoader: ) if file_config.get("survey_list"): + logger.info("Loading survey list for {}".format(ha_name)) survey_list = self.load_survey_list( file_path=file_config["survey_list"]["filepath"], ha_name=ha_name, @@ -209,3 +249,4 @@ def app(): } loader = DataLoader(files) + loader.load() From 8c61cca85d821b5fa7c2901bfa164249f8c1dce6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 11:47:08 +0000 Subject: [PATCH 21/48] matching 7% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7fbddd54..257e71d2 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -151,9 +151,12 @@ class DataLoader: def merge_ha_6(self, asset_list, survey_list): + # Correct the asset list + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place") + # Prepare the asset list - asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().strip() - asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().strip() + asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip() split_addresses = asset_list['matching_address'].str.split(',', expand=True) split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5'] From 1e52fe7fb97061d64194456e6d56bd814b660cf7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 11:51:28 +0000 Subject: [PATCH 22/48] 11% complete matching --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 257e71d2..3bfea948 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -166,6 +166,14 @@ class DataLoader: asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1) del split_addresses, house_numbers + # Correct the survey list + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Seabridge Road", "Seabridge Lane" + ) + + # Strip out /KNUTTON from the street name + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/KNUTTON", "") + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): house_number = row["NO."] From 7a2c90cbf36c7e1d73452527683d52e8719be382 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 11:54:20 +0000 Subject: [PATCH 23/48] matching 23% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 3bfea948..dfbd4fa4 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -174,6 +174,18 @@ class DataLoader: # Strip out /KNUTTON from the street name survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/KNUTTON", "") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Clevend Road", "Cleveland Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "TURNERS AVENUE", "Turner Avenue" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "WEDGEWWOD AVENUE", "Wedgwood Avenue" + ) + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): house_number = row["NO."] From ed0bbf44c76a1303fac20814875f3e99798ed9bd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 12:02:43 +0000 Subject: [PATCH 24/48] matching 34% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index dfbd4fa4..9cae6e37 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -153,6 +153,8 @@ class DataLoader: # Correct the asset list asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place") + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree") + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close") # Prepare the asset list asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip() @@ -185,6 +187,8 @@ class DataLoader: survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( "WEDGEWWOD AVENUE", "Wedgwood Avenue" ) + # The cherrytree record has wrong postcode + survey_list.loc[survey_list["Street / Block Name"] == "Cherrytree road", "Post Code"] = "ST5 7BP" matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): From 702de41d464e27cf4dad6db118dd1ac367e99c27 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 12:36:03 +0000 Subject: [PATCH 25/48] matching 42% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 9cae6e37..5b2cefcd 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -190,6 +190,20 @@ class DataLoader: # The cherrytree record has wrong postcode survey_list.loc[survey_list["Street / Block Name"] == "Cherrytree road", "Post Code"] = "ST5 7BP" + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "MONUMENT RD", "Monument Road" + ) + + # Generally replace " RD" with " Road" + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(" RD", " Road") + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "HILARY Road", "Hillary Road" + ) + + # Remove full stops from the street name + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(".", "") + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): house_number = row["NO."] From 4601edbf27f3dd4de5d6df05495d3cfdd1f4e74d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 12:42:01 +0000 Subject: [PATCH 26/48] matching 51% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 5b2cefcd..ff717f86 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -204,6 +204,21 @@ class DataLoader: # Remove full stops from the street name survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(".", "") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Chatworth road", "Chatsworth Place" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Wood Croft", "Woodcroft" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Milstone Avenue", "Millstone Avenue" + ) + + # Strip out /TALKE from the street name + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/TALKE", "") + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): house_number = row["NO."] From 96893aae14c3896a62fbb8c76a58e41ea567a3e3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 16:51:43 +0000 Subject: [PATCH 27/48] matching 61% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index ff717f86..261c0fd4 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -219,6 +219,10 @@ class DataLoader: # Strip out /TALKE from the street name survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/TALKE", "") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Woodcutts Street", "Woodshutts Street" + ) + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): house_number = row["NO."] From a2a8bc012e51b1f2d9977e865e216c10f14b9ba0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 17:13:35 +0000 Subject: [PATCH 28/48] matching 82% complete --- .../ha_15_32/ha_analysis_batch_3.py | 41 ++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 261c0fd4..3a5b4ab4 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -142,12 +142,13 @@ class DataLoader: # We now do the matching between the asset list and the survey list. # What we'll get from this is a lookup table from the asset list to the survey list + matched_lookup = pd.DataFrame() if ha_name == "ha_6": - self.merge_ha_6(asset_list, survey_list) + matched_lookup = self.merge_ha_6(asset_list, survey_list) else: raise NotImplementedError("Only HA 6 has surveys") - return survey_list + return survey_list, matched_lookup def merge_ha_6(self, asset_list, survey_list): @@ -223,6 +224,42 @@ class DataLoader: "Woodcutts Street", "Woodshutts Street" ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "HILLARY AVENUE", "Hillary Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "HILLARY AVENUE", "Hillary Road" + ) + + # Replace " Rd" with " Road" + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(" Rd", " Road") + + # We have a record listed as 19, MAPLE AVENUE ST7 1JX, when it should be 19, Hollins Crescent ST7 1JX + survey_list.loc[ + (survey_list["Street / Block Name"] == "MAPLE AVENUE") & + (survey_list["NO."].isin([19])) & + (survey_list["Post Code"] == "ST7 1JX"), + "Street / Block Name" + ] = "Hollins Crescent" + + # However, some of the maple avenue records, are indeed Maple avenue, but are listed with the wrong postcode. + # E.g. number 26 + survey_list.loc[ + (survey_list["Street / Block Name"] == "MAPLE AVENUE") & + (survey_list["NO."].isin([26])) & + (survey_list["Post Code"] == "ST7 1JX"), + "Post Code" + ] = "ST7 1JW" + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "BURSLEY Road", "Bursley Way" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Brittania Avenue", "Brittain Avenue" + ) + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): house_number = row["NO."] From 48ec641675a34b4c7c8b6c1cdbae62ecf2cc45d3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 17:17:57 +0000 Subject: [PATCH 29/48] 82% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 3a5b4ab4..61a90d14 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -156,6 +156,7 @@ class DataLoader: asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place") asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree") asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close") + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way") # Prepare the asset list asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip() @@ -260,6 +261,9 @@ class DataLoader: "Brittania Avenue", "Brittain Avenue" ) + # Moffat Way + # Moffatt Way + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): house_number = row["NO."] From 90a47d765b315f78c47ce1f4db4851b5cfa9e633 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 17:46:20 +0000 Subject: [PATCH 30/48] matching 89% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 61a90d14..05c0299c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -142,7 +142,6 @@ class DataLoader: # We now do the matching between the asset list and the survey list. # What we'll get from this is a lookup table from the asset list to the survey list - matched_lookup = pd.DataFrame() if ha_name == "ha_6": matched_lookup = self.merge_ha_6(asset_list, survey_list) else: @@ -261,8 +260,9 @@ class DataLoader: "Brittania Avenue", "Brittain Avenue" ) - # Moffat Way - # Moffatt Way + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Hawthorn Road", "Hawthorne Road" + ) matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): @@ -274,6 +274,7 @@ class DataLoader: df = asset_list[ asset_list["matching_address"].str.contains(row["Street / Block Name"].lower().strip()) ].copy() + df = df[df["matching_address"].str.contains(str(house_number))] if df.shape[0] != 1: df = df[df["HouseNo"] == str(house_number)] From 4ed4c154805eb2f907b07200bd96b4bec8ed0566 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 17:47:40 +0000 Subject: [PATCH 31/48] matching 95% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 05c0299c..52117d17 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -264,6 +264,10 @@ class DataLoader: "Hawthorn Road", "Hawthorne Road" ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Eastdale Place", "Easdale Place" + ) + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): house_number = row["NO."] From 709a50f02ef5b263d6c83b46aac8a1839ed98511 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 18:46:43 +0000 Subject: [PATCH 32/48] setting up cache --- .../ha_15_32/ha_analysis_batch_3.py | 82 +++++++++++++++++-- utils/s3.py | 54 +++++++++++- 2 files changed, 129 insertions(+), 7 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 52117d17..bf91d8b6 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -6,7 +6,7 @@ from tqdm import tqdm from datetime import datetime import pandas as pd import numpy as np -from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet +from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet, save_pickle_to_s3, read_pickle_from_s3 from utils.logger import setup_logger from dotenv import load_dotenv from tqdm import tqdm @@ -39,8 +39,11 @@ class DataLoader: }, } - def __init__(self, files): + def __init__(self, files, use_cache): self.files = files + self.use_cache = use_cache + + self.data = {} def load_asset_list(self, file_path, ha_name, sheet_name=None): workbook = openpyxl.load_workbook(file_path) @@ -149,7 +152,8 @@ class DataLoader: return survey_list, matched_lookup - def merge_ha_6(self, asset_list, survey_list): + @staticmethod + def merge_ha_6(asset_list, survey_list): # Correct the asset list asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place") @@ -268,8 +272,39 @@ class DataLoader: "Eastdale Place", "Easdale Place" ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Wedgewood Road", "Wedgwood Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Droitwich Drive", "Droitwich Close" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Longdale Road", "Langdale Road" + ) + + # We have 2 addresses in the survey list that don't have postcodes. We'll manually add them in + survey_list.loc[ + (survey_list["Street / Block Name"] == "Rogers Avenue") & + pd.isnull(survey_list["Post Code"]), + "Post Code" + ] = "ST5 9AT" + + survey_list.loc[ + (survey_list["Street / Block Name"] == "Cedar Road") & + pd.isnull(survey_list["Post Code"]), + "Post Code" + ] = "ST5 7BY" + + missed_postcodes = [ + postcode.lower() for postcode in survey_list["Post Code"] if + postcode.lower() not in asset_list["matching_postcode"].values + ] + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): + house_number = row["NO."] if isinstance(house_number, str): house_number = house_number.lower().strip() @@ -285,6 +320,16 @@ class DataLoader: if df.shape[0] != 1: df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower())] if df.shape[0] != 1: + postcode_lower = row["Post Code"].lower() + if postcode_lower in missed_postcodes: + matching_lookup.append( + { + "survey_list_row_id": row["survey_list_row_id"], + "asset_list_row_id": None, + } + ) + continue + print(row["Street / Block Name"]) print(house_number) print(row["Post Code"].lower()) @@ -297,8 +342,19 @@ class DataLoader: } ) + matching_lookup = pd.DataFrame(matching_lookup) + + return matching_lookup + def load(self): + if self.use_cache: + self.data = read_pickle_from_s3( + bucket_name="retrofit-datalake-dev", + s3_file_name="ha-analysis/batch3-inputs.pickle", + ) + return + data = {} for ha_name, file_config in self.files.items(): # Load asset list @@ -311,19 +367,31 @@ class DataLoader: if file_config.get("survey_list"): logger.info("Loading survey list for {}".format(ha_name)) - survey_list = self.load_survey_list( + survey_list, matched_lookup = self.load_survey_list( file_path=file_config["survey_list"]["filepath"], ha_name=ha_name, sheet_name=file_config["survey_list"]["sheetname"] ) else: survey_list = None + matched_lookup = None data[ha_name] = { "asset_list": asset_list, - "survey_list": survey_list + "survey_list": survey_list, + "matched_lookup": matched_lookup } + self.data = data + + # Cache the data in s3 + # We need to pickle the data and store in s3 + save_pickle_to_s3( + data=self.data, + bucket_name="retrofit-datalake-dev", + s3_file_name="ha-analysis/batch3-inputs.pickle", + ) + def app(): """ @@ -332,6 +400,8 @@ def app(): :return: """ + use_cache = False + files = { "ha_1": { "asset_list": { @@ -354,5 +424,5 @@ def app(): "ha_107": {"asset_list": "etl/eligibility/ha_15_32/HA 107 - ASSET LIST.xlsx"} } - loader = DataLoader(files) + loader = DataLoader(files, use_cache) loader.load() diff --git a/utils/s3.py b/utils/s3.py index e63b7192..3d6cf038 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -1,3 +1,4 @@ +import pickle import boto3 from io import BytesIO, StringIO from botocore.exceptions import NoCredentialsError, PartialCredentialsError @@ -141,5 +142,56 @@ def save_csv_to_s3(dataframe, bucket_name, file_name): s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket_name, Key=file_name) return True except Exception as e: - print(f"An error occurred: {e}") + logger.error(f"An error occurred: {e}") return False + + +def save_pickle_to_s3(data, bucket_name, s3_file_name): + """ + Save an object to an S3 bucket as a pickle file. + + :param data: The data to save + :param bucket_name: The name of the S3 bucket + :param s3_file_name: The file name to use for the saved data in S3 (should end in .pkl) + """ + # Serialize data to a pickle format + try: + serialized_data = pickle.dumps(data) + except Exception as e: + print(f'Failed to serialize data: {str(e)}') + return + + # Use save_data_to_s3 function to upload the serialized data to S3 + save_data_to_s3(serialized_data, bucket_name, s3_file_name) + + +def read_pickle_from_s3(bucket_name, s3_file_name): + """ + Read a pickle file from an S3 bucket and return the data. + + :param bucket_name: The name of the S3 bucket + :param s3_file_name: The file name of the pickle file in S3 + :return: The data read from the pickle file + """ + try: + s3 = boto3.client('s3') + s3_response = s3.get_object(Bucket=bucket_name, Key=s3_file_name) + serialized_data = s3_response['Body'].read() + except NoCredentialsError: + logger.errpr("Credentials not available.") + return None + except PartialCredentialsError: + logger.errpr("Incomplete credentials provided.") + return None + except Exception as e: + logger.errpr(f'Failed to download data from {bucket_name}/{s3_file_name}: {str(e)}') + return None + + # Deserialize data from pickle format + try: + data = pickle.loads(serialized_data) + except Exception as e: + logger.errpr(f'Failed to deserialize data: {str(e)}') + return None + + return data From 0620c45a223e542218caa84211c695e1461b385c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 18:59:53 +0000 Subject: [PATCH 33/48] Added read for other ha files --- .../ha_15_32/ha_analysis_batch_3.py | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index bf91d8b6..85f8704d 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -37,6 +37,15 @@ class DataLoader: "green": "FF92D050", "purple": "FF7030A0", "red": "FFFF0000", "blue": "FF00B0F0" } }, + "ha_14": { + "asset_list": {"red": "FFFF0000", "green": "FF00B050"}, + }, + "ha_39": { + "asset_list": {"red": "FFFF0000", "green": "FF00B050"}, + }, + "ha_107": { + "asset_list": {"red": "FFFF0000", "green": "FF00B050"}, + } } def __init__(self, files, use_cache): @@ -368,6 +377,7 @@ class DataLoader: if file_config.get("survey_list"): logger.info("Loading survey list for {}".format(ha_name)) survey_list, matched_lookup = self.load_survey_list( + asset_list=asset_list, file_path=file_config["survey_list"]["filepath"], ha_name=ha_name, sheet_name=file_config["survey_list"]["sheetname"] @@ -419,9 +429,24 @@ def app(): "sheetname": "HA 6" } }, - "ha_14": {"asset_list": "etl/eligibility/ha_15_32/HA 14 - ASSET LIST.xlsx"}, - "ha_39": {"asset_list": "etl/eligibility/ha_15_32/HA 39 - ASSET LIST.xlsx"}, - "ha_107": {"asset_list": "etl/eligibility/ha_15_32/HA 107 - ASSET LIST.xlsx"} + "ha_14": { + "asset_list": { + "filepath": "etl/eligibility/ha_15_32/HA 14 - ASSET LIST.xlsx", + "sheetname": "HA 14" + } + }, + "ha_39": { + "asset_list": { + "filepath": "etl/eligibility/ha_15_32/HA 39 - ASSET LIST.xlsx", + "sheetname": "Sheet1" + } + }, + "ha_107": { + "asset_list": { + "filepath": "etl/eligibility/ha_15_32/HA 107 - ASSET LIST.xlsx", + "sheetname": "HA 107" + } + } } loader = DataLoader(files, use_cache) From 9ac6b25b9fa1adf91926109d6a5610d50bee28b8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 23 Jan 2024 18:06:34 +0000 Subject: [PATCH 34/48] improving data read code to create standardised matching_address and house number --- backend/ml_models/Valuation.py | 2 + .../ha_15_32/ha_analysis_batch_3.py | 134 +++++++++++++++--- etl/testing_data/livewest_pilot.py | 38 +++++ .../the_guiness_partnership_pilot.py | 38 +++++ 4 files changed, 192 insertions(+), 20 deletions(-) create mode 100644 etl/testing_data/livewest_pilot.py create mode 100644 etl/testing_data/the_guiness_partnership_pilot.py diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index dadef9a9..ff771252 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -22,6 +22,8 @@ class PropertyValuation: 100021192109: 650000, # Based on Zoopla 766249482: 358000, # Based on Zoopla estimate for 19 Spring Lane, 3 bedroom semi-detached 100120703802: 277000, # Based on Zoopla + 10014469685: 286000, # Based on Zoopla + 10001328782: 196000, # Based on Zoopla } # We base our valuation uplifts on a number of sources diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 85f8704d..54cd7c58 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1,8 +1,7 @@ import os -import msgpack import openpyxl from pathlib import Path -from tqdm import tqdm +import msgpack from datetime import datetime import pandas as pd import numpy as np @@ -48,6 +47,14 @@ class DataLoader: } } + MIN_ROWS = { + "ha_1": 2, + "ha_6": 2, + "ha_14": 3, # The spreadsheet starts from the third row + "ha_39": 2, + "ha_107": 2, + } + def __init__(self, files, use_cache): self.files = files self.use_cache = use_cache @@ -60,11 +67,14 @@ class DataLoader: sheet = workbook[sheet_name] else: sheet = workbook.active - sheet_colnames = [cell.value for cell in sheet[1]] + sheet_colnames = [cell.value for cell in sheet[self.MIN_ROWS[ha_name] - 1]] rows_data = [] rows_colors = [] - for row in tqdm(sheet.iter_rows(min_row=2, values_only=False)): # Assuming the first row is headers + for row in tqdm( + sheet.iter_rows(min_row=self.MIN_ROWS[ha_name], values_only=False) + ): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None # row_color = COLOR_INDEX[row_color] @@ -73,8 +83,12 @@ class DataLoader: asset_list = pd.DataFrame(rows_data, columns=sheet_colnames) asset_list = asset_list.loc[:, asset_list.columns.notnull()] + asset_list['row_color'] = rows_colors + # Remove entirely empty roww - consider all rows apart from row_color + asset_list = asset_list.loc[asset_list.loc[:, asset_list.columns != 'row_color'].notnull().any(axis=1)] + asset_list_colours = self.COLOUR_CONFIG[ha_name]["asset_list"] asset_list["row_colour_name"] = np.where( @@ -92,6 +106,54 @@ class DataLoader: # Add in asset_list_row_id asset_list["asset_list_row_id"] = [ha_name + str(i) for i in range(0, len(asset_list))] + # Prepare the asset list + # Depending on the HA, we need to rename some columns + if ha_name == "ha_1": + asset_list["matching_address"] = asset_list["Address"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Address - Postcode"].str.lower().str.strip() + elif ha_name == "ha_6": + asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip() + elif ha_name == "ha_14": + # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode + asset_list["matching_address"] = asset_list["Address 1"].str.lower().str.strip() + ", " + \ + asset_list["Address 2"].str.lower().str.strip() + ", " + \ + asset_list["Address 3"].str.lower().str.strip() + ", " + \ + asset_list["Address 4"].str.lower().str.strip() + ", " + \ + asset_list["Postcode"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip() + elif ha_name == "ha_39": + # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code + asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["add_2"].str.lower().str.strip() + ", " + \ + asset_list["add_3"].str.lower().str.strip() + ", " + \ + asset_list["add_4"].str.lower().str.strip() + ", " + \ + asset_list["add_5"].str.lower().str.strip() + ", " + \ + asset_list["post_code"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["post_code"].str.lower().str.strip() + elif ha_name == "ha_107": + # Create matching_address by concatenating House No, Street, Town, District, Postcode + asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Street"].str.lower().str.strip() + ", " + \ + asset_list["Town"].str.lower().str.strip() + ", " + \ + asset_list["District"].str.lower().str.strip() + ", " + \ + asset_list["Postcode"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip() + else: + raise NotImplementedError("implement me") + + if ha_name in ["ha_107"]: + asset_list["HouseNo"] = asset_list["House No"].copy() + else: + split_addresses = asset_list['matching_address'].str.split(',', expand=True) + house_numbers = split_addresses[0].str.split(' ', expand=True) + # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how + # many columns there might be + house_numbers = house_numbers.iloc[:, 0:1] + house_numbers.columns = ['HouseNo'] + + asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1) + return asset_list def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None): @@ -165,22 +227,10 @@ class DataLoader: def merge_ha_6(asset_list, survey_list): # Correct the asset list - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place") - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree") - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close") - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way") - - # Prepare the asset list - asset_list["matching_address"] = asset_list["propertyaddress"].str.lower().str.strip() - asset_list["matching_postcode"] = asset_list["Post Code"].str.lower().str.strip() - - split_addresses = asset_list['matching_address'].str.split(',', expand=True) - split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5'] - house_numbers = split_addresses['temp'].str.split(' ', expand=True) - house_numbers.columns = ['HouseNo', 'part1', 'part2', "part3", "part4", "part5"] - - asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1) - del split_addresses, house_numbers + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("baggott place", "baggotts place") + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("cherry tree", "cherrytree") + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("maryhill close", "mary hill close") + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("moffat way", "moffatt way") # Correct the survey list survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( @@ -403,6 +453,30 @@ class DataLoader: ) +def get_epc_data(loader): + if not loader.data: + raise ValueError("Data not found - please run loader.load() first") + + property_type_lookup = {} + + for ha_name, data_assets in loader.data.items(): + # For each HA, we read pull in the data required, and store in S3 + asset_list = data_assets["asset_list"] + + # We iterate through the asset list and pull what we need + for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): + searcher = SearchEpc( + address1=property_meta["No."], + postcode=property_meta["Postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key=None, + full_address=property_meta["Address"] + ) + searcher.ordnance_survey_client.property_type = property_type_lookup[property_meta["Type"]]["property-type"] + searcher.ordnance_survey_client.built_form = property_type_lookup[property_meta["Type"]]["built-form"] + searcher.find_property(skip_os=True) + + def app(): """ This app contains the housign association analysis for HAs 1, 6, 14, 39 and 107. @@ -451,3 +525,23 @@ def app(): loader = DataLoader(files, use_cache) loader.load() + + # TODO: We probably need to make sure that we have all of the columns that we need + + # We load in the additional data required to perform the analysis + + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + + cleaning_data = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", + ) + + created_at = datetime.now().isoformat() + + photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") + + get_epc_data(loader) diff --git a/etl/testing_data/livewest_pilot.py b/etl/testing_data/livewest_pilot.py new file mode 100644 index 00000000..580c16d0 --- /dev/null +++ b/etl/testing_data/livewest_pilot.py @@ -0,0 +1,38 @@ +""" +This script will create an input csv for the recommendation engine and upload it to S3, which can be used for +testing +""" +import os + +import pandas as pd +from utils.s3 import save_csv_to_s3 + +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None) +USER_ID = 8 +PORTFOLIO_ID = 61 + + +def app(): + pilot_file = pd.DataFrame( + [ + {"address": "42, Foxes Field", "postcode": "TR18 3RJ", "Notes": None}, + {"address": "11, Cranley Gardens", "postcode": "TQ13 8UT", "Notes": None}, + ] + ) + + # Store the data in s3 + filename = f"{USER_ID}/{PORTFOLIO_ID}/livewest_pilot_file.csv" + save_csv_to_s3( + dataframe=pilot_file, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increase EPC", + "goal_value": "C", + "trigger_file_path": filename + } + print(body) diff --git a/etl/testing_data/the_guiness_partnership_pilot.py b/etl/testing_data/the_guiness_partnership_pilot.py new file mode 100644 index 00000000..496ea7ea --- /dev/null +++ b/etl/testing_data/the_guiness_partnership_pilot.py @@ -0,0 +1,38 @@ +""" +This script will create an input csv for the recommendation engine and upload it to S3, which can be used for +testing +""" +import os + +import pandas as pd +from utils.s3 import save_csv_to_s3 + +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None) +USER_ID = 8 +PORTFOLIO_ID = 59 + + +def app(): + pilot_file = pd.DataFrame( + [ + {"address": "10 Elm Close", "postcode": "CV37 8XL", "Notes": None}, + {"address": "21, Spring Lane", "postcode": "MK17 0QP", "Notes": None}, + ] + ) + + # Store the data in s3 + filename = f"{USER_ID}/{PORTFOLIO_ID}/the_guiness_partnership_pilot_file.csv" + save_csv_to_s3( + dataframe=pilot_file, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increase EPC", + "goal_value": "C", + "trigger_file_path": filename + } + print(body) From 4b73aa75b2ea9db59b15886cf68f811402767f0a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 23 Jan 2024 18:19:29 +0000 Subject: [PATCH 35/48] fixed the bug in matching ha6 assets and surveys --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 54cd7c58..63a72714 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -226,11 +226,18 @@ class DataLoader: @staticmethod def merge_ha_6(asset_list, survey_list): - # Correct the asset list - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("baggott place", "baggotts place") - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("cherry tree", "cherrytree") - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("maryhill close", "mary hill close") - asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("moffat way", "moffatt way") + # Correct the asset list across propertyaddress and matching_address + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Baggott Place", "Baggotts Place") + asset_list["matching_address"] = asset_list["matching_address"].str.replace("baggott place", "baggotts place") + + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Cherry Tree", "Cherrytree") + asset_list["matching_address"] = asset_list["matching_address"].str.replace("cherry tree", "cherrytree") + + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Maryhill Close", "Mary Hill Close") + asset_list["matching_address"] = asset_list["matching_address"].str.replace("maryhill close", "mary hill close") + + asset_list["propertyaddress"] = asset_list["propertyaddress"].str.replace("Moffat Way", "Moffatt Way") + asset_list["matching_address"] = asset_list["matching_address"].str.replace("moffat way", "moffatt way") # Correct the survey list survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( From 04aeaae613351c030af740b5f4d4637057bc43a2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 24 Jan 2024 00:17:17 +0000 Subject: [PATCH 36/48] working on new ha batch --- backend/Property.py | 2 +- etl/eligibility/ha_15_32/app.py | 75 ++++----- .../ha_15_32/ha_analysis_batch_3.py | 144 ++++++++++++++++-- etl/epc/Record.py | 9 +- 4 files changed, 167 insertions(+), 63 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index e6ae8bbe..e527c1ea 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -68,7 +68,7 @@ class Property(Definitions): self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None self.restricted_measures = False self.year_built = epc_record.get("year_built") - self.number_of_rooms = epc_record.prepared_epc.get("number_heated_rooms") + self.number_of_rooms = epc_record.prepared_epc.get("number_habitable_rooms") self.age_band = epc_record.get("age_band") self.construction_age_band = epc_record.get("construction_age_band") self.number_of_floors = epc_record.get("number_of_floors") diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py index ce216364..a68bf272 100644 --- a/etl/eligibility/ha_15_32/app.py +++ b/etl/eligibility/ha_15_32/app.py @@ -4,6 +4,7 @@ used by the Warmfront team, to identify which properties are eligible for ECO4 a work is being done in December 2023, prior to completion of acquisition """ import pickle +from etl.epc.Record import EPCRecord from pathlib import Path from tqdm import tqdm import pandas as pd @@ -345,48 +346,31 @@ def prepare_model_data_row( :param modelling_epc: :return: """ + + epc_records = { + 'original_epc': modelling_epc.copy(), + 'full_sap_epc': full_sap_epc.copy(), + 'old_data': old_data.copy(), + } + + prepared_epc = EPCRecord( + epc_records=epc_records, + run_mode="newdata", + cleaning_data=cleaning_data + ) + p = Property( id=property_id, postcode=modelling_epc["postcode"], address=modelling_epc["address1"], - data=modelling_epc, - old_data=old_data, - full_sap_epc=full_sap_epc + epc_record=prepared_epc ) - p.get_components(cleaned, photo_supply_lookup=photo_supply_lookup, - floor_area_decile_thresholds=floor_area_decile_thresholds) - - # THIS IS TEMP AND SHOULDN'T BE HERE - data_to_clean = p.get_model_data() - if data_to_clean["NUMBER_HEATED_ROOMS"] in ['', None]: - data_to_clean["NUMBER_HEATED_ROOMS"] = data_to_clean["NUMBER_HABITABLE_ROOMS"] - p.data["number-heated-rooms"] = data_to_clean["NUMBER_HABITABLE_ROOMS"] - - # This is temp - this should happen after scoring - cleaned_property_data = DataProcessor.apply_averages_cleaning( - data_to_clean=pd.DataFrame([dict(**data_to_clean, LOCAL_AUTHORITY=p.data["local-authority"])]), - cleaning_data=cleaning_data, - cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], + p.get_components( + cleaned, photo_supply_lookup=photo_supply_lookup, floor_area_decile_thresholds=floor_area_decile_thresholds ) - p.set_number_lighting_outlets(cleaned_property_data) - data_processor = DataProcessor(None, newdata=True) - data_processor.insert_data(pd.DataFrame([p.get_model_data()])) - - data_processor.pre_process() - - starting_epc_data = data_processor.get_component_features(suffix="_STARTING") - ending_epc_data = data_processor.get_component_features(suffix="_ENDING") - fixed_data = data_processor.get_fixed_features() - - # We update the ending record with the recommended updates and we set lodgement date to today - ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(created_at) - - # We simulate the impact of the retrofit using expected performance of the wall and roof, - # after retrofit. We use the minimal u-values required to meet building regulations part L - # TODO: Check the performance of the materials warmfront's installers use, particularly for - # cavity + p.create_base_difference_epc_record(cleaned_lookup=cleaned) cavity_simulation = { "recommendation_id": "-".join([property_id, "cavity"]), @@ -402,21 +386,16 @@ def prepare_model_data_row( "parts": [{"depth": 270}] } - cavity_scoring = create_recommendation_scoring_data( - property=p, - recommendation=cavity_simulation, - starting_epc_data=starting_epc_data, - ending_epc_data=ending_epc_data, - fixed_data=fixed_data, - ) + simulations = [ + [cavity_simulation], + [loft_simulation] + ] - loft_scoring = create_recommendation_scoring_data( - property=p, - recommendation=loft_simulation, - starting_epc_data=starting_epc_data, - ending_epc_data=ending_epc_data, - fixed_data=fixed_data, - ) + p.adjust_difference_record_with_recommendations(simulations) + + # Make sure we definitely have the correct data + cavity_scoring = [x for x in p.recommendations_scoring_data if "cavity" in x["id"]][0] + loft_scoring = [x for x in p.recommendations_scoring_data if "loft" in x["id"]][0] return [cavity_scoring, loft_scoring] diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 63a72714..1bb0f0c4 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -460,29 +460,155 @@ class DataLoader: ) -def get_epc_data(loader): +def get_epc_data( + loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds +): if not loader.data: raise ValueError("Data not found - please run loader.load() first") - property_type_lookup = {} + property_type_lookup = { + "ha_1": { + "built_form": { + 'Mid Terrace': 'Mid-Terrace', + 'Semi-Detached': 'Semi-Detached', + 'End Terrace': 'End-Terrace', + 'Detached': 'Detached', + 'Enclosed Mid': 'Mid-Terrace', + 'Detached Local Connect': 'Detached', + } + } + } for ha_name, data_assets in loader.data.items(): # For each HA, we read pull in the data required, and store in S3 - asset_list = data_assets["asset_list"] + asset_list = data_assets["asset_list"].copy() + + # If the survey list is missing, it means we have no yet completed any surveys and therefore should only + # consider the most recent EPC + consider_penultimate_epc = data_assets["survey_list"] is None # We iterate through the asset list and pull what we need + results = [] + scoring_data = [] for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): + + if ha_name == "ha_1": + property_type = property_meta["Asset Type"] + # We correct a small error + if property_type == "a": + property_type = "House" + + # Remap bedsits to flats + if property_type in ["Bedsit", "Room"]: + property_type = "Flat" + + built_form = property_type_lookup[ha_name]["built_form"].get(property_meta["Property Type"], None) + else: + raise NotImplementedError("Implement me") + searcher = SearchEpc( - address1=property_meta["No."], - postcode=property_meta["Postcode"], + address1=property_meta["HouseNo"], + postcode=property_meta["matching_postcode"], auth_token=EPC_AUTH_TOKEN, os_api_key=None, - full_address=property_meta["Address"] + full_address=property_meta["matching_address"] ) - searcher.ordnance_survey_client.property_type = property_type_lookup[property_meta["Type"]]["property-type"] - searcher.ordnance_survey_client.built_form = property_type_lookup[property_meta["Type"]]["built-form"] + searcher.ordnance_survey_client.property_type = property_type + searcher.ordnance_survey_client.built_form = built_form searcher.find_property(skip_os=True) + if searcher.newest_epc.get("estimated"): + # We insert the row ID as our proxy for UPRN + searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1]) + + newest_epc = searcher.newest_epc + older_epcs = searcher.older_epcs + full_sap_epc = searcher.full_sap_epc + + # If we have a survey list, we check the penultimate, because the property might have been installed + penultimate_epc = newest_epc + if consider_penultimate_epc: + # We also want to get the penultimate epc + penultimate_epc, _ = searcher.filter_newest_epc(older_epcs) + if not penultimate_epc: + penultimate_epc = newest_epc + + eligibility = Eligibility(epc=newest_epc, cleaned=cleaned) + eligibility.check_gbis_warmfront() + eligibility.check_eco4_warmfront() + + if (not eligibility.eco4_warmfront["eligible"]) and ( + not eligibility.gbis_warmfront + ) and consider_penultimate_epc: + # We check the penultimate epc + eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned) + eligibility.check_gbis_warmfront() + eligibility.check_eco4_warmfront() + # If this is the case, we need to update the older epcs + # We don't update just to make data cleaning easier + if penultimate_epc.get("estimated") is None: + older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]] + + # If the property is a cavity wall and it's filled, we produce an estimate for the age of the cavity + # Loft MUST be suitable + cavity_age = None + if ( + eligibility.walls["is_cavity_wall"] and + eligibility.walls["is_filled_cavity"] and + eligibility.loft["suitability"] and + eligibility.eco4_warmfront["message"] == "Failed due to full cavity - check cavity age" + ): + # We check the age of the cavity and if it's particularly old, we flag it + cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned) + + # Full checks + eligibility.check_gbis() + eligibility.check_eco4() + + if eligibility.eco4_warmfront["eligible"]: + if eligibility.epc["uprn"] == "": + eligibility.epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1]) + + scoring_dictionary = prepare_model_data_row( + property_id=property_meta["asset_list_row_id"], + modelling_epc=eligibility.epc, + cleaned=cleaned, + cleaning_data=cleaning_data, + created_at=created_at, + old_data=older_epcs, + full_sap_epc=full_sap_epc, + photo_supply_lookup=photo_supply_lookup, + floor_area_decile_thresholds=floor_area_decile_thresholds + ) + scoring_data.extend(scoring_dictionary) + + results.append( + { + "row_id": property_meta["asset_list_row_id"], + "uprn": eligibility.epc["uprn"], + "property_type": eligibility.epc["property-type"], + "gbis_eligible": eligibility.gbis_warmfront, + "eco4_eligible": eligibility.eco4_warmfront["eligible"], + "eco4_message": eligibility.eco4_warmfront["message"], + "sap": float(eligibility.epc["current-energy-efficiency"]), + "gbis_eligible_future": eligibility.gbis["eligible"], + "gbis_eligible_future_message": eligibility.gbis["message"], + "eco4_eligible_future": eligibility.eco4["eligible"], + "eco4_eligible_future_message": eligibility.eco4["message"], + # Property components + "roof": eligibility.roof["clean_description"], + "walls": eligibility.walls["clean_description"], + "cavity_type": eligibility.cavity["type"], + "heating": eligibility.epc["mainheat-description"], + "tenure": eligibility.tenure, + "date_epc": eligibility.epc["lodgement-date"], + "loft_thickness": eligibility.roof["insulation_thickness"], + "cavity_age": cavity_age, + **eligibility.walls, + **eligibility.roof, + } + ) + def app(): """ @@ -491,7 +617,7 @@ def app(): :return: """ - use_cache = False + use_cache = True files = { "ha_1": { diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 6fb4d5d9..f0bbcbfa 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -361,7 +361,7 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - if self.prepared_epc["fixed-lighting-outlets-count"] == "": + if self.prepared_epc["fixed-lighting-outlets-count"] in ["", None] + list(DATA_ANOMALY_MATCHES): # We check old EPCs and the full SAP EPC @@ -537,7 +537,7 @@ class EPCRecord: else: value = 0 else: - value = int(value) + value = int(float(value)) self.prepared_epc[attribute] = value @@ -583,9 +583,8 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - self.prepared_epc['photo-supply'] = float(self.prepared_epc['photo-supply']) if self.prepared_epc[ - 'photo-supply'] != "" \ - else None + self.prepared_epc['photo-supply'] = float(self.prepared_epc['photo-supply']) if ( + self.prepared_epc['photo-supply'] not in [None, ""]) else None def _clean_energy(self): """ From 013070073c9431db2471c5851a342e8d8779f869 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 24 Jan 2024 10:59:21 +0000 Subject: [PATCH 37/48] updated cleaning of construction age band to also clean the prepared epc --- etl/epc/Record.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/etl/epc/Record.py b/etl/epc/Record.py index f0bbcbfa..aac22618 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -618,9 +618,11 @@ class EPCRecord: if not self.prepared_epc: raise ValueError("EPC Recrod doesn not contain epc data") - self.construction_age_band = EPCDataProcessor.clean_construction_age_band( - self.prepared_epc["construction-age-band"]) - if self.construction_age_band in DATA_ANOMALY_MATCHES: + self.prepared_epc["construction-age-band"] = EPCDataProcessor.clean_construction_age_band( + self.prepared_epc["construction-age-band"] + ) + + if self.prepared_epc["construction-age-band"] in DATA_ANOMALY_MATCHES: if self.old_data: # Take the most recent max_datetime = max( @@ -630,15 +632,17 @@ class EPCRecord: most_recent = [old_record for old_record in self.old_data if old_record["lodgement-datetime"] == max_datetime] - self.construction_age_band = EPCDataProcessor.clean_construction_age_band( + self.prepared_epc["construction-age-band"] = EPCDataProcessor.clean_construction_age_band( most_recent[0]["construction-age-band"] ) + self.construction_age_band = self.prepared_epc["construction-age-band"] self.age_band = england_wales_age_band_lookup.get(self.construction_age_band) if (self.prepared_epc["transaction-type"] == "new dwelling") and (self.age_band is None): self.age_band = "L" self.construction_age_band = 'England and Wales: 2012 onwards' + self.prepared_epc["construction-age-band"] = self.construction_age_band if self.age_band is None: raise ValueError("age_band is missing") From f2872def6480cb000010e77431654dc62ee44f8d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 24 Jan 2024 11:16:11 +0000 Subject: [PATCH 38/48] Adde None to DATA_ANOMALY_MATCHES --- etl/epc/settings.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/epc/settings.py b/etl/epc/settings.py index 24c23ebc..33bab190 100644 --- a/etl/epc/settings.py +++ b/etl/epc/settings.py @@ -43,7 +43,9 @@ DATA_ANOMALY_MATCHES = { # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases. "NULL", # We sometimes see fields populated with just an empty string. - "" + "", + # We sometimes find None values - particulatly when we produce an estimated EPC + None, } DATA_ANOMALY_SUBSTRINGS = { From 60e3221fa312c813116889d46c6276b06f0b3068 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 24 Jan 2024 12:19:44 +0000 Subject: [PATCH 39/48] patching eligibility for missing rows in cleaned_lookup --- .../ha_15_32/ha_analysis_batch_3.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1bb0f0c4..14b6dfcf 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -485,7 +485,7 @@ def get_epc_data( # If the survey list is missing, it means we have no yet completed any surveys and therefore should only # consider the most recent EPC - consider_penultimate_epc = data_assets["survey_list"] is None + consider_penultimate_epc = data_assets["survey_list"] is not None # We iterate through the asset list and pull what we need results = [] @@ -669,6 +669,22 @@ def app(): ) cleaned = msgpack.unpackb(cleaned, raw=False) + # Patch to handle the a missing description + cleaned["floor-description"].extend( + [ + {'original_description': 'To external air, uninsulated (assumed)', + 'clean_description': 'To external air, no insulation', 'thermal_transmittance': None, + 'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': False, + 'is_to_external_air': True, 'is_suspended': False, 'is_solid': False, 'another_property_below': False, + 'insulation_thickness': 'none'}, + {'original_description': 'To unheated space, uninsulated (assumed)', + 'clean_description': 'To unheated space, uninsulated', 'thermal_transmittance': None, + 'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': True, + 'is_to_external_air': False, 'is_suspended': False, 'is_solid': False, 'another_property_below': False, + 'insulation_thickness': 'average'} + ] + ) + cleaning_data = read_dataframe_from_s3_parquet( bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", ) From f5d780a1b0ab920cfdcaf69a0e6d341ef865e11c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 24 Jan 2024 12:59:56 +0000 Subject: [PATCH 40/48] Added back in filling of age with national average --- .../ha_15_32/ha_analysis_batch_3.py | 21 +++++++++++++++++++ etl/epc/Record.py | 4 +++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 14b6dfcf..9143df5f 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -610,6 +610,27 @@ def get_epc_data( ) +def analyse_ha_data(): + """ + The approach we take within this function is the following: + For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The + characterisation can be broken down as the following: + 1) The property has been identified by Warmfront and is eligible for ECO4/GBIS work, under the strictest criteria + 2) The property has been identified by Warmfront, however it has a full cavity, and therefore would be subject to + a CIGA check + 3) The property has been identified by Warmfront, but the EPC shows that the property has more than 100mm loft + insulation + 4) The property has been identified by Warmfront, but doesn't look like a property that would likely qualify under + any cirsumstances, given the available data + + Then, for any property that has NOT been identifid by Warmfront, we identify properties that look like they would + qualify under the strictest criteria, and mark these as potential additional opportunities. + + :return: + """ + pass + + def app(): """ This app contains the housign association analysis for HAs 1, 6, 14, 39 and 107. diff --git a/etl/epc/Record.py b/etl/epc/Record.py index aac22618..9fcf31ff 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -645,7 +645,9 @@ class EPCRecord: self.prepared_epc["construction-age-band"] = self.construction_age_band if self.age_band is None: - raise ValueError("age_band is missing") + self.age_band = "C" + self.construction_age_band = "England and Wales: 1930-1949" + self.prepared_epc["construction-age-band"] = self.construction_age_band def _clean_year_built(self): """ From d557653129a3aac4b8b45d1c5d6a14d01ac6ac8b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 24 Jan 2024 14:16:33 +0000 Subject: [PATCH 41/48] patched issue with cleaned lookup --- .../ha_15_32/ha_analysis_batch_3.py | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 9143df5f..85486e17 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -609,6 +609,75 @@ def get_epc_data( } ) + scoring_df = pd.DataFrame(scoring_data) + scoring_df = scoring_df.drop( + columns=[ + "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", + "carbon_ending" + ] + ) + + model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at) + + all_predictions = model_api.predict_all( + df=scoring_df, + bucket="retrofit-data-dev", + prediction_buckets={ + "sap_change_predictions": "retrofit-sap-predictions-dev", + "heat_demand_predictions": "retrofit-heat-predictions-dev", + "carbon_change_predictions": "retrofit-carbon-predictions-dev" + } + ) + + results_df = pd.DataFrame(results) + + predictions = all_predictions["sap_change_predictions"].copy() + + predictions = predictions.rename(columns={"property_id": "row_id"}).merge( + results_df[["row_id", "sap"]], how="left", on="row_id" + ) + predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"] + predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index() + + results_df = results_df.merge( + predictions[["sap_uplift", "row_id"]], + how="left", + on="row_id" + ) + results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"] + + eligibility_assessment = [] + for _, row in results_df[results_df["eco4_eligible"] == True].iterrows(): + # The upgrade requirements are dependent on the current SAP + + # If the property is an F or G, it only needs to upgrade to an % + if row["sap"] <= 38: + if row["post_install_sap"] >= 57: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 55: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 53: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + else: + + if row["post_install_sap"] >= 71: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 69: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 67: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + + eligibility_assessment.append( + { + "row_id": row["row_id"], + "eligibility_classification": eligibility_classification + } + ) + def analyse_ha_data(): """ @@ -706,6 +775,23 @@ def app(): ] ) + # We treat unknown loft insulation as no insulation + cleaned["roof-description"].extend( + [ + {'original_description': 'Pitched, Unknown loft insulation', 'clean_description': 'Pitched, no insulation', + 'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_pitched': True, + 'is_roof_room': False, + 'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True, + 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'none'} + ] + ) + + # We patch this record because there is another property below + for x in cleaned["floor-description"]: + if x["original_description"] == '(Same dwelling below) insulated (assumed)': + x["another_property_below"] = True + x["thermal_transmittance"] = 0 + cleaning_data = read_dataframe_from_s3_parquet( bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", ) From edb541f3dc3dca9f03fc75b1e7e399fcf9d6790f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 24 Jan 2024 14:36:46 +0000 Subject: [PATCH 42/48] patching heating controls --- backend/Property.py | 1 + etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/backend/Property.py b/backend/Property.py index e527c1ea..4d26857d 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -319,6 +319,7 @@ class Property(Definitions): attributes = [ x for x in cleaned[description] if x["original_description"] == self.data[description] ] + if len(attributes) > 1: raise ValueError("Either No attributes or multiple found for %s" % description) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 85486e17..66183599 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -786,6 +786,16 @@ def app(): ] ) + # Patch mainheatcont-description + cleaned["mainheatcont-description"].extend( + [ + {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': False, + 'charging_system': False, 'switch_system': False, 'no_control': False, 'dhw_control': False, + 'community_heating': False, 'multiple_room_thermostats': False, 'auxiliary_systems': False, 'trvs': False, + 'rate_control': False} + ] + ) + # We patch this record because there is another property below for x in cleaned["floor-description"]: if x["original_description"] == '(Same dwelling below) insulated (assumed)': From ef27d6b1640b6f2003b0d6b3c30c40ce15418486 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 24 Jan 2024 21:21:01 +0000 Subject: [PATCH 43/48] Added booleans to clean missings --- BaseUtility.py | 4 +- backend/Property.py | 9 +- etl/eligibility/Eligibility.py | 15 ++ .../ha_15_32/ha_analysis_batch_3.py | 237 +++++++++++++++++- etl/epc/Dataset.py | 33 ++- etl/epc/settings.py | 2 + 6 files changed, 288 insertions(+), 12 deletions(-) diff --git a/BaseUtility.py b/BaseUtility.py index bd2f091e..e799144d 100644 --- a/BaseUtility.py +++ b/BaseUtility.py @@ -45,7 +45,9 @@ class Definitions: # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases. "NULL", # We sometimes see fields populated with just an empty string. - "" + "", + # An older value which rarely shows up but has been seen in the data. + "UNKNOWN", } DATA_ANOMALY_SUBSTRINGS = { diff --git a/backend/Property.py b/backend/Property.py index 4d26857d..82695b75 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -13,7 +13,7 @@ from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map from etl.solar.SolarPhotoSupply import SolarPhotoSupply from utils.logger import setup_logger from utils.s3 import read_dataframe_from_s3_parquet -from BaseUtility import Definitions +from etl.epc.settings import DATA_ANOMALY_MATCHES from recommendations.rdsap_tables import england_wales_age_band_lookup, FLOOR_LEVEL_MAP from recommendations.recommendation_utils import ( estimate_perimeter, get_wall_type, estimate_external_wall_area, esimtate_pitched_roof_area, estimate_windows @@ -25,7 +25,7 @@ DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT = logger = setup_logger() -class Property(Definitions): +class Property: ATTRIBUTE_MAP = { "floor-description": "floor", "hotwater-description": "hotwater", @@ -51,6 +51,8 @@ class Property(Definitions): spatial = None base_difference_record = None + DATA_ANOMALY_MATCHES = DATA_ANOMALY_MATCHES + def __init__(self, id, postcode, address, epc_record): self.epc_record = epc_record @@ -302,6 +304,7 @@ class Property(Definitions): self.set_basic_property_dimensions() for description, attribute in cleaned.items(): + if self.data[description] in self.DATA_ANOMALY_MATCHES: template = cleaned[description][0] fill_dict = dict(zip(template.keys(), [None] * len(template))) @@ -319,7 +322,7 @@ class Property(Definitions): attributes = [ x for x in cleaned[description] if x["original_description"] == self.data[description] ] - + if len(attributes) > 1: raise ValueError("Either No attributes or multiple found for %s" % description) diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index 13966655..6a5c03e1 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -233,6 +233,13 @@ class Eligibility: def room_roof_insulation(self): is_room_roof = self.roof["is_roof_room"] + if not is_room_roof: + self.room_roof = { + "suitability": False, + "thickness": None + } + return + insulation_thickness = convert_thickness_to_numeric( self.roof["insulation_thickness"], self.roof["is_pitched"], @@ -246,6 +253,14 @@ class Eligibility: def flat_roof_insulation(self): is_flat = self.roof["is_flat"] + + if not is_flat: + self.flat_roof = { + "suitability": False, + "thickness": None + } + return + insulation_thickness = convert_thickness_to_numeric( self.roof["insulation_thickness"], self.roof["is_pitched"], diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 66183599..8ee5d743 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -154,6 +154,10 @@ class DataLoader: asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1) + # Finally, we process property_type or built form, where needed + if ha_name == "ha_6": + asset_list["built_form"] = asset_list["Property Type"].apply(self.identify_built_form_ha6) + return asset_list def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None): @@ -412,6 +416,34 @@ class DataLoader: return matching_lookup + @staticmethod + def identify_built_form_ha6(property_string): + """ + Identify the built form of a property from the given string. + + :param property_string: The string describing the property + :return: The identified built form, or None if it cannot be identified + """ + # Define keywords for each built form + built_forms = { + 'Semi-Detached': ['semi detached'], + 'Detached': ['detached'], + 'Mid-Terrace': ['mid terrace', 'mid town house'], + 'End-Terrace': ['end terrace', 'end town house'] + } + + # Normalize the input string to lower case for comparison + property_string_normalized = property_string.lower() + + # Search for each built form keyword in the input string + for built_form, keywords in built_forms.items(): + for keyword in keywords: + if keyword in property_string_normalized: + return built_form + + # Return None if no built form is identified + return None + def load(self): if self.use_cache: @@ -461,7 +493,7 @@ class DataLoader: def get_epc_data( - loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds + loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True ): if not loader.data: raise ValueError("Data not found - please run loader.load() first") @@ -476,10 +508,39 @@ def get_epc_data( 'Enclosed Mid': 'Mid-Terrace', 'Detached Local Connect': 'Detached', } + }, + "ha_6": { + "property_type": { + 'HOUSE': "House", + 'GROUND FLOOR FLAT': "Flat", + 'UPPER FLOOR FLAT': "Flat", + 'MAISONETTE': "Maisonette", + 'BUNGALOW': "Bungalow", + 'WARDEN BUNGALOW': "Bungalow", + 'WARDEN FLAT': "Flat", + 'EXTRACARE SCHEME': "Flat", + } + } } + outputs = {} for ha_name, data_assets in loader.data.items(): + + if not pull_data: + # Then we retrieve the data from S3 + processed_ha_results = read_pickle_from_s3( + bucket_name="retrofit-datalake-dev", + s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle" + ) + + outputs[ha_name] = { + "results_df": processed_ha_results["results_df"], + "scoring_data": processed_ha_results["scoring_df"], + "nodata": processed_ha_results["nodata"] + } + continue + # For each HA, we read pull in the data required, and store in S3 asset_list = data_assets["asset_list"].copy() @@ -490,8 +551,12 @@ def get_epc_data( # We iterate through the asset list and pull what we need results = [] scoring_data = [] + nodata = [] for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): + if property_meta["matching_postcode"] is None: + continue + if ha_name == "ha_1": property_type = property_meta["Asset Type"] # We correct a small error @@ -503,6 +568,9 @@ def get_epc_data( property_type = "Flat" built_form = property_type_lookup[ha_name]["built_form"].get(property_meta["Property Type"], None) + elif ha_name == "ha_6": + property_type = property_type_lookup[ha_name]["property_type"][property_meta["Dwelling type"]] + built_form = property_meta["built_form"] else: raise NotImplementedError("Implement me") @@ -517,6 +585,10 @@ def get_epc_data( searcher.ordnance_survey_client.built_form = built_form searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + nodata.append(property_meta) + continue + if searcher.newest_epc.get("estimated"): # We insert the row ID as our proxy for UPRN searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1]) @@ -606,6 +678,7 @@ def get_epc_data( "cavity_age": cavity_age, **eligibility.walls, **eligibility.roof, + "is_estimated": searcher.newest_epc.get("estimated") is not None } ) @@ -619,6 +692,10 @@ def get_epc_data( model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at) + # scoring_df["is_community"].value_counts() + # scoring_df[scoring_df["is_community"] == "Unknown"] + # property_meta = asset_list[asset_list["asset_list_row_id"] == "ha_67238"].squeeze() + all_predictions = model_api.predict_all( df=scoring_df, bucket="retrofit-data-dev", @@ -678,8 +755,33 @@ def get_epc_data( } ) + eligibility_assessment = pd.DataFrame(eligibility_assessment) -def analyse_ha_data(): + results_df = results_df.merge( + eligibility_assessment, how="left", on="row_id" + ) + + # We store the results in S3 as a pickle + save_pickle_to_s3( + data={ + "results_df": results_df, + "scoring_data": scoring_df, + "nodata": nodata + }, + bucket_name="retrofit-datalake-dev", + s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle" + ) + + outputs[ha_name] = { + "results_df": results_df, + "scoring_data": scoring_df, + "nodata": nodata + } + + return outputs + + +def analyse_ha_data(outputs, loader): """ The approach we take within this function is the following: For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The @@ -697,6 +799,127 @@ def analyse_ha_data(): :return: """ + + for ha_name, datasets in outputs.items(): + + # TODO: This is placeholder because we don't have the schemes that the properties have been qualified for + # yet + # + import random + randomly_allocated_schemes = random.choices(["ECO4", "GBIS"], k=inputs["asset_list"].shape[0]) + inputs["asset_list"]["randomly_allocated_schemes"] = randomly_allocated_schemes + inputs["asset_list"]["funding_scheme"] = None + inputs["asset_list"]["funding_scheme"] = np.where( + inputs["asset_list"]["row_meaning"] == "identified potential eco works (CWI)", + inputs["asset_list"]["randomly_allocated_schemes"], + inputs["asset_list"]["funding_scheme"] + ) + + # End placholder + + results_df = datasets["results_df"].copy() + + inputs = [x for k, x in loader.data.items() if k == ha_name][0] + + analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename( + columns={"row_meaning": "asset_identification_status"} + ).merge( + results_df, + how="left", + right_on="row_id", + left_on="asset_list_row_id" + ) + + # If we have a survey list, we merge this onto the results + + n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique() + + properties_sold = ( + inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if + inputs["survey_list"] is not None else 0 + ) + properties_sold_eco4 = ( + properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if + properties_sold != 0 else 0 + ) + properties_sold_gbis = ( + properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if + properties_sold != 0 else 0 + ) + + # We now merge the survey list onto the analysis data and remove anything that is sold, to give us just what is + # remaining + + if inputs["matched_lookup"] is not None: + analysis_data = analysis_data.merge( + inputs["matched_lookup"], how="left", on="asset_list_row_id" + ) + # Drop any rows that have a survey_list_row_id + analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])] + + # We now calculate the number of remaining properties, by scheme + # TODO: We might need to tweak a bit of the knowledge + remaining_properties = analysis_data[ + analysis_data["asset_identification_status"] == "identified potential eco works (CWI)" + ] + + remaining_properties_by_scheme = ( + remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index() + ) + remaining_properties_eco4 = remaining_properties_by_scheme[ + remaining_properties_by_scheme["funding_scheme"] == "ECO4" + ]["asset_list_row_id"].values[0] + + remaining_properties_gbis = remaining_properties_by_scheme[ + remaining_properties_by_scheme["funding_scheme"] == "GBIS" + ]["asset_list_row_id"].values[0] + + # For the remaining properties, we use the results of the eligibility process to classify the property into + # one of multiple categories + # + # For properties that have been identified as ECO4 + # 1) Strict ECO4 candidate - Has required fabric and EPC is below a D + # - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties + # here and re-surveying is a common practicce by Warmfront. Additionally, many of the social homes have + # very old EPCs which may score lower when re-done + # 2) Subject to CIGA check - Meets loft conditions but shows a filled cavity. + # - we don't have a SAP constraint here because the EPC is (currently) showing what the property might + # actually look like after retrofit and so the EPC currently being a C or above means little, because + # the updated EPC, showing an empty cavity, could bring the property within + # 3) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation. + # - No SAP constraint, for the same reason as in category 2) + # 4) Does not look like ECO4 candidate + # + # For properties that have been identified as GBIS + # 1) Strict GBIS candidates + # 2) Properties that actually look like strict GBIS candidates + # 3) Subject to CIGA check - Filled cavity + # 4) Does not look like a GBIS candidate + + # ECO4 + # 1) We identify this if: + # - remaining_properties["eco4_eligible"] == True + # - remaining_properties[""] + remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts() + remaining_properties["eco4_message"].value_counts() + z = remaining_properties[ + (remaining_properties["eco4_message"] == "Possibly eligible but property currently EPC D") & + (remaining_properties["eco4_eligible"] == True) + ] + + k = z[z["property_type"] == "Flat"] + k["uprn"] + + ha_analysis_results = { + "n_properties_in_asset_list": n_properties_in_asset_list, + # ECO4 + "properties_sold_eco4": properties_sold_eco4, + "remaining_properties_eco4": remaining_properties_eco4, + # GBIS + "properties_sold_gbis": properties_sold_gbis, + "remaining_properties_gbis": remaining_properties_gbis + } + pass @@ -789,10 +1012,10 @@ def app(): # Patch mainheatcont-description cleaned["mainheatcont-description"].extend( [ - {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': False, - 'charging_system': False, 'switch_system': False, 'no_control': False, 'dhw_control': False, - 'community_heating': False, 'multiple_room_thermostats': False, 'auxiliary_systems': False, 'trvs': False, - 'rate_control': False} + {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': None, + 'charging_system': None, 'switch_system': None, 'no_control': None, 'dhw_control': None, + 'community_heating': None, 'multiple_room_thermostats': False, 'auxiliary_systems': None, 'trvs': None, + 'rate_control': None} ] ) @@ -810,4 +1033,4 @@ def app(): photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") - get_epc_data(loader) + outputs = get_epc_data(loader) diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index fbc7a2d2..4a159f4b 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -11,6 +11,37 @@ from recommendations.recommendation_utils import ( get_wall_type ) +# TODO: Can probably produce this in the property change app and store in S3 +BOOLEAN_VARIABLES = [ + 'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame', + 'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone', 'is_park_home', + 'external_insulation', 'internal_insulation', 'is_park_home_ending', 'external_insulation_ending', + 'internal_insulation_ending', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended', 'is_solid', + 'another_property_below', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters', + 'has_dwelling_above', 'has_radiators', 'has_fan_coil_units', 'has_pipes_in_screed_above_insulation', + 'has_pipes_in_insulated_timber_floor', 'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', + 'has_room_heaters', 'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating', + 'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', 'has_no_system_present', + 'has_portable_electric_heaters', 'has_water_source_heat_pump', 'has_electric_heat_pump', 'has_micro-cogeneration', + 'has_solar_assisted_heat_pump', 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', + 'has_mains_gas', 'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', + 'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire', + 'has_assumed_for_most_rooms', 'has_underfloor_heating', 'has_radiators_ending', 'has_fan_coil_units_ending', + 'has_pipes_in_screed_above_insulation_ending', 'has_pipes_in_insulated_timber_floor_ending', + 'has_pipes_in_concrete_slab_ending', 'has_boiler_ending', 'has_air_source_heat_pump_ending', + 'has_room_heaters_ending', 'has_electric_storage_heaters_ending', 'has_warm_air_ending', + 'has_electric_underfloor_heating_ending', 'has_electric_ceiling_heating_ending', 'has_community_scheme_ending', + 'has_ground_source_heat_pump_ending', 'has_no_system_present_ending', 'has_portable_electric_heaters_ending', + 'has_water_source_heat_pump_ending', 'has_electric_heat_pump_ending', 'has_micro-cogeneration_ending', + 'has_solar_assisted_heat_pump_ending', 'has_exhaust_source_heat_pump_ending', 'has_community_heat_pump_ending', + 'has_electric_ending', 'has_mains_gas_ending', 'has_wood_logs_ending', 'has_coal_ending', 'has_oil_ending', + 'has_wood_pellets_ending', 'has_anthracite_ending', 'has_dual_fuel_mineral_and_wood_ending', + 'has_smokeless_fuel_ending', 'has_lpg_ending', 'has_b30k_ending', 'has_electricaire_ending', + 'has_assumed_for_most_rooms_ending', 'has_underfloor_heating_ending', 'multiple_room_thermostats', + 'multiple_room_thermostats_ending', 'is_community', 'no_individual_heating_or_community_network', + 'is_community_ending', 'no_individual_heating_or_community_network_ending' +] + class BaseDataset: """ @@ -439,7 +470,7 @@ class TrainingDataset(BaseDataset): for col in missings.index: unique_values = self.df[col].unique() - if True in unique_values or False in unique_values: + if (True in unique_values) or (False in unique_values) or (col in BOOLEAN_VARIABLES): self.df[col] = self.df[col].fillna(False) if "none" in unique_values: self.df[col] = self.df[col].fillna("none") diff --git a/etl/epc/settings.py b/etl/epc/settings.py index 33bab190..87f27972 100644 --- a/etl/epc/settings.py +++ b/etl/epc/settings.py @@ -46,6 +46,8 @@ DATA_ANOMALY_MATCHES = { "", # We sometimes find None values - particulatly when we produce an estimated EPC None, + # An older value which rarely shows up but has been seen in the data. + "UNKNOWN", } DATA_ANOMALY_SUBSTRINGS = { From 3cfb2002e41a4ec5b3120b7f5d0ac781a94f1310 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 25 Jan 2024 14:38:24 +0000 Subject: [PATCH 44/48] Handling property type for ha 107 and 39 --- .../ha_15_32/ha_analysis_batch_3.py | 126 +++++++++++++++++- 1 file changed, 119 insertions(+), 7 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 8ee5d743..dfd95100 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -125,11 +125,11 @@ class DataLoader: elif ha_name == "ha_39": # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \ - asset_list["add_2"].str.lower().str.strip() + ", " + \ - asset_list["add_3"].str.lower().str.strip() + ", " + \ - asset_list["add_4"].str.lower().str.strip() + ", " + \ - asset_list["add_5"].str.lower().str.strip() + ", " + \ - asset_list["post_code"].str.lower().str.strip() + asset_list["add_2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["add_3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["add_4"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["add_5"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["post_code"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["post_code"].str.lower().str.strip() elif ha_name == "ha_107": # Create matching_address by concatenating House No, Street, Town, District, Postcode @@ -520,10 +520,70 @@ def get_epc_data( 'WARDEN FLAT': "Flat", 'EXTRACARE SCHEME': "Flat", } - + }, + "ha_14": { + "property_type": { + "House": "House", + "Flat": "Flat", + "Bungalow": "Bungalow", + "Maisonette": "Maisonette", + } + }, + "ha_39": { + "Semi house": {"property_type": "House", "built_form": "Semi-Detached"}, + "1st floor flat": {"property_type": "Flat", "built_form": None}, + "Mid terrace house": {"property_type": "House", "built_form": "Mid-Terrace"}, + "Ground floor flat": {"property_type": "Flat", "built_form": None}, + "End terrace house": {"property_type": "House", "built_form": "End-Terrace"}, + "Semi bungalow": {"property_type": "Bungalow", "built_form": "Semi-Detached"}, + "End terrace bungalow": {"property_type": "Bungalow", "built_form": "End-Terrace"}, + "2nd floor flat": {"property_type": "Flat", "built_form": None}, + "Mid terrace bungalow": {"property_type": "Bungalow", "built_form": "Mid-Terrace"}, + "3rd floor flat": {"property_type": "Flat", "built_form": None}, + "Detached bungalow": {"property_type": "Bungalow", "built_form": "Detached"}, + "Maisonette": {"property_type": "Maisonette", "built_form": None}, + "Detached house": {"property_type": "House", "built_form": "Detached"}, + "Lower ground floor flat": {"property_type": "Flat", "built_form": None}, + "Dormer bungalow": {"property_type": "Bungalow", "built_form": None}, + "Basement flat": {"property_type": "Flat", "built_form": None}, + "Cluster House": {"property_type": "House", "built_form": "Detached"}, + "2nd/3rd floor duplex flat": {"property_type": "Flat", "built_form": None}, + "Ground floor flat with study": {"property_type": "Flat", "built_form": None}, + "4th floor flat": {"property_type": "Flat", "built_form": None}, + "1st floor flat with study room": {"property_type": "Flat", "built_form": None}, + "2nd floor flat with study": {"property_type": "Flat", "built_form": None}, + }, + "ha_107": { + "property_type": { + "HOUSE": "House", + "BUNGALOW": "Bungalow", + "GRD FLOOR FLAT": "Flat", + "FIRST FLOOR FLAT": "Flat", + "SHELTERED BUNGALOW": "Bungalow", + "MAISONETTE": "Maisonette", + "SECOND FLOOR FLAT": "Flat", + "SHELTERED FIRST FLR": "Flat", + "SHELTERED GROUND FLR": "Flat", + "GRD FLOOR BED SIT": "House" + }, + "built_form": { + "Semi Detached": "Semi-Detached", + "Mid Terrace": "Mid-Terrace", + "End Terrace": "End-Terrace", + "Detached": "Detached", + "Detatched": "Detached", + } } } + # TODO: Sort these + # DwellingType + # UNKNOWN 395 + # SHELTERED FIRST FLR 77 + # 62 + # ROOM 4 + # GRD FLOOR BED SIT 3 + outputs = {} for ha_name, data_assets in loader.data.items(): @@ -571,11 +631,63 @@ def get_epc_data( elif ha_name == "ha_6": property_type = property_type_lookup[ha_name]["property_type"][property_meta["Dwelling type"]] built_form = property_meta["built_form"] + elif ha_name == "ha_14": + if property_meta["Asset Type Description"] == "Block - Repair": + # We try and deduce if it's a flat or house, depending on if it has "room" or "flats" in the address + if "room" in property_meta["Address 1"].lower(): + property_type = "House" + else: + property_type = "Flat" + + else: + property_type = property_type_lookup[ha_name]["property_type"][ + property_meta["Asset Type Description"] + ] + + built_form = None + elif ha_name == "ha_39": + + property_type_config = property_type_lookup[ha_name].get(property_meta["ConstructionStyle"], {}) + property_type = property_type_config.get("property_type", None) + built_form = property_type_config.get("built_form", None) + + if property_type is None: + # We check for the presence of room or flat + if "flat" in property_meta["matching_address"]: + property_type = "Flat" + else: + property_type = "House" + elif ha_name == "ha_107": + + dwelling_style = property_meta["Dwelling Style"] + if isinstance(dwelling_style, str): + dwelling_style = dwelling_style.strip() + + property_type = property_type_lookup[ha_name]["property_type"].get(property_meta["DwellingType"]) + built_form = property_type_lookup[ha_name]["built_form"].get(dwelling_style, None) + + if property_type is None: + if built_form in ["Semi-Detached", "Mid-Terrace", "End-Terrace", "Detached"]: + property_type = "House" + + if "flat" in property_meta["Wall Construction"].lower(): + property_type = "Flat" + + if (property_meta["DwellingType"] == "UNKNOWN") & (property_meta["Dwelling Style"] == 0): + # Hand a few specific cases + property_type = "Bungalow" + + if property_meta["Street"] == "School View": + property_type = "Bungalow" + + if property_type is None: + blah + else: raise NotImplementedError("Implement me") searcher = SearchEpc( - address1=property_meta["HouseNo"], + address1=str(property_meta["HouseNo"]), postcode=property_meta["matching_postcode"], auth_token=EPC_AUTH_TOKEN, os_api_key=None, From 5c6bac1f8a2823037b0a1ac28481f741e7110ee9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 26 Jan 2024 11:00:12 +0000 Subject: [PATCH 45/48] working on eligibility --- etl/eligibility/Eligibility.py | 49 ++++++++++---- .../ha_15_32/ha_analysis_batch_3.py | 64 ++++++++++--------- 2 files changed, 70 insertions(+), 43 deletions(-) diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index 6a5c03e1..00c72a8e 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -114,7 +114,8 @@ class Eligibility: self.loft = { "suitability": False, "thickness": None, - "reason": "roof not loft" + "reason": "roof not loft", + "thickness_classification": None } return @@ -125,18 +126,32 @@ class Eligibility: is_flat=self.roof["is_flat"] ) + if insulation_thickness <= 100: + thickness_classification = "0-100mm" + elif insulation_thickness <= 270: + thickness_classification = "100-270mm" + else: + thickness_classification = "270mm+" + if insulation_thickness <= loft_thickness_threshold: + # We produce a thiclkness classification for the loft + # 0 - 100mm insulation + # 100 - 270mm insulation + # 270mm+ insulation + self.loft = { "suitability": True, "thickness": insulation_thickness, - "reason": None + "reason": None, + "thickness_classification": thickness_classification } if insulation_thickness <= high_loft_thickness_threshold: self.loft = { "suitability": True, "thickness": insulation_thickness, - "reason": "high loft thickness but below regulation" + "reason": "high loft thickness but below regulation", + "thickness_classification": thickness_classification } return @@ -145,7 +160,8 @@ class Eligibility: self.loft = { "suitability": False, "thickness": insulation_thickness, - "reason": "existing insulation" + "reason": "existing insulation", + "thickness_classification": thickness_classification } return @@ -371,20 +387,21 @@ class Eligibility: """ current_sap = int(self.epc["current-energy-efficiency"]) - - if current_sap >= 69: - self.eco4_warmfront = { - "eligible": False, - "message": "sap too high" - } - return - self.cavity_insulation() self.loft_insulation() # make sure conditions 2 and 3 are true is_eligible = self.cavity["suitability"] & self.loft["suitability"] + if current_sap >= 69: + self.eco4_warmfront = { + "eligible": False, + "message": "sap too high", + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] + } + return + if post_retrofit_sap is None: if current_sap >= 55: @@ -401,7 +418,9 @@ class Eligibility: self.eco4_warmfront = { "eligible": is_eligible, - "message": message + "message": message, + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] } return @@ -409,7 +428,9 @@ class Eligibility: self.eco4_warmfront = { "eligible": is_eligible, - "message": None + "message": None, + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] } return diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index dfd95100..1212522e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -12,11 +12,9 @@ from tqdm import tqdm from backend.SearchEpc import SearchEpc from etl.eligibility.Eligibility import Eligibility from etl.eligibility.ha_15_32.app import prepare_model_data_row -from etl.epc.settings import COLUMNS_TO_MERGE_ON from backend.ml_models.api import ModelApi from etl.solar.SolarPhotoSupply import SolarPhotoSupply from recommendations.recommendation_utils import calculate_cavity_age -from recommendation_utils import convert_thickness_to_numeric EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env" @@ -576,14 +574,6 @@ def get_epc_data( } } - # TODO: Sort these - # DwellingType - # UNKNOWN 395 - # SHELTERED FIRST FLR 77 - # 62 - # ROOM 4 - # GRD FLOOR BED SIT 3 - outputs = {} for ha_name, data_assets in loader.data.items(): @@ -596,7 +586,7 @@ def get_epc_data( outputs[ha_name] = { "results_df": processed_ha_results["results_df"], - "scoring_data": processed_ha_results["scoring_df"], + "scoring_df": processed_ha_results["scoring_df"], "nodata": processed_ha_results["nodata"] } continue @@ -680,9 +670,6 @@ def get_epc_data( if property_meta["Street"] == "School View": property_type = "Bungalow" - if property_type is None: - blah - else: raise NotImplementedError("Implement me") @@ -790,7 +777,9 @@ def get_epc_data( "cavity_age": cavity_age, **eligibility.walls, **eligibility.roof, - "is_estimated": searcher.newest_epc.get("estimated") is not None + "is_estimated": searcher.newest_epc.get("estimated") is not None, + "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"], + "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"] } ) @@ -877,7 +866,7 @@ def get_epc_data( save_pickle_to_s3( data={ "results_df": results_df, - "scoring_data": scoring_df, + "scoring_df": scoring_df, "nodata": nodata }, bucket_name="retrofit-datalake-dev", @@ -886,7 +875,7 @@ def get_epc_data( outputs[ha_name] = { "results_df": results_df, - "scoring_data": scoring_df, + "scoring_df": scoring_df, "nodata": nodata } @@ -914,6 +903,7 @@ def analyse_ha_data(outputs, loader): for ha_name, datasets in outputs.items(): + inputs = [x for k, x in loader.data.items() if k == ha_name][0] # TODO: This is placeholder because we don't have the schemes that the properties have been qualified for # yet # @@ -930,9 +920,6 @@ def analyse_ha_data(outputs, loader): # End placholder results_df = datasets["results_df"].copy() - - inputs = [x for k, x in loader.data.items() if k == ha_name][0] - analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename( columns={"row_meaning": "asset_identification_status"} ).merge( @@ -970,19 +957,20 @@ def analyse_ha_data(outputs, loader): analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])] # We now calculate the number of remaining properties, by scheme - # TODO: We might need to tweak a bit of the knowledge + # TODO: We might need to tweak a bit of the logic remaining_properties = analysis_data[ analysis_data["asset_identification_status"] == "identified potential eco works (CWI)" - ] + ].copy() + remaining_properties["prospect_type"] = None remaining_properties_by_scheme = ( remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index() ) - remaining_properties_eco4 = remaining_properties_by_scheme[ + n_remaining_properties_eco4 = remaining_properties_by_scheme[ remaining_properties_by_scheme["funding_scheme"] == "ECO4" ]["asset_list_row_id"].values[0] - remaining_properties_gbis = remaining_properties_by_scheme[ + n_remaining_properties_gbis = remaining_properties_by_scheme[ remaining_properties_by_scheme["funding_scheme"] == "GBIS" ]["asset_list_row_id"].values[0] @@ -990,7 +978,8 @@ def analyse_ha_data(outputs, loader): # one of multiple categories # # For properties that have been identified as ECO4 - # 1) Strict ECO4 candidate - Has required fabric and EPC is below a D + # 1) Strict ECO4 candidate - Has required fabric and EPC is D or below. We consider D or below here, because + # Warmfront regularly re-surveys properties which then fall within the SAP requirement # - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties # here and re-surveying is a common practicce by Warmfront. Additionally, many of the social homes have # very old EPCs which may score lower when re-done @@ -1008,10 +997,25 @@ def analyse_ha_data(outputs, loader): # 3) Subject to CIGA check - Filled cavity # 4) Does not look like a GBIS candidate + remaining_eco4_df = remaining_properties[ + remaining_properties["funding_scheme"] == "ECO4" + ].copy() # ECO4 # 1) We identify this if: # - remaining_properties["eco4_eligible"] == True - # - remaining_properties[""] + + remaining_eco4_df["prospect_type"] = np.where( + remaining_eco4_df["eco4_eligible"] == True, + "strict ECO4", + remaining_eco4_df["prospect_type"] + ) + + # 2) We identify this if it has a filled cavity but meets the loft conditions + + remaining_eco4_df["prospect_type"] + + z = remaining_eco4_df[remaining_eco4_df["eco4_message"] == "sap too high"] + remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts() remaining_properties["eco4_message"].value_counts() z = remaining_properties[ @@ -1026,10 +1030,10 @@ def analyse_ha_data(outputs, loader): "n_properties_in_asset_list": n_properties_in_asset_list, # ECO4 "properties_sold_eco4": properties_sold_eco4, - "remaining_properties_eco4": remaining_properties_eco4, + "n_remaining_properties_eco4": n_remaining_properties_eco4, # GBIS "properties_sold_gbis": properties_sold_gbis, - "remaining_properties_gbis": remaining_properties_gbis + "n_remaining_properties_gbis": n_remaining_properties_gbis } pass @@ -1145,4 +1149,6 @@ def app(): photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") - outputs = get_epc_data(loader) + outputs = get_epc_data( + loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=False + ) From b6c57c7253ec86b59ef1599489a405a9466ce505 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 26 Jan 2024 17:17:43 +0000 Subject: [PATCH 46/48] created template of code to create the ha analysis results --- etl/eligibility/Eligibility.py | 6 +- .../ha_15_32/ha_analysis_batch_3.py | 242 +++++++++++++++--- 2 files changed, 207 insertions(+), 41 deletions(-) diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index 00c72a8e..1d868338 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -177,15 +177,13 @@ class Eligibility: is_empty = (not self.walls["is_filled_cavity"]) or ( self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["average", "above average"] ) - is_partial_filled = ( - self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["below average"] - ) + is_partial_filled = "partial" in self.walls["clean_description"].lower() # We look for potentially under performing cavities - anything that is assumed, as built and insulated is_underperforming = ( self.walls["is_as_built"] and self.walls["insulation_thickness"] in ["average"] and self.walls["is_assumed"] ) - is_unfilled_cavity = is_cavity and is_empty + is_unfilled_cavity = is_cavity and (is_empty and not is_partial_filled) is_partial_filled_cavity = is_cavity and is_partial_filled is_underperforming_cavity = is_cavity and is_underperforming diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1212522e..1ed95a30 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -901,6 +901,7 @@ def analyse_ha_data(outputs, loader): :return: """ + ha_analysis_results = [] for ha_name, datasets in outputs.items(): inputs = [x for k, x in loader.data.items() if k == ha_name][0] @@ -917,9 +918,20 @@ def analyse_ha_data(outputs, loader): inputs["asset_list"]["funding_scheme"] ) + # TODO: Also temp, just for HA 6 + if ha_name == "ha_6": + inputs["survey_list"]["funding_scheme"] = None + inputs["survey_list"]["funding_scheme"] = np.where( + inputs["survey_list"][ + 'AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION '] == "AFFORDABLE WARMTH", + "ECO4", + "GBIS" + ) + # End placholder results_df = datasets["results_df"].copy() + analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename( columns={"row_meaning": "asset_identification_status"} ).merge( @@ -929,23 +941,6 @@ def analyse_ha_data(outputs, loader): left_on="asset_list_row_id" ) - # If we have a survey list, we merge this onto the results - - n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique() - - properties_sold = ( - inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if - inputs["survey_list"] is not None else 0 - ) - properties_sold_eco4 = ( - properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if - properties_sold != 0 else 0 - ) - properties_sold_gbis = ( - properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if - properties_sold != 0 else 0 - ) - # We now merge the survey list onto the analysis data and remove anything that is sold, to give us just what is # remaining @@ -956,8 +951,23 @@ def analyse_ha_data(outputs, loader): # Drop any rows that have a survey_list_row_id analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])] + # If we have a survey list, we merge this onto the results + n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique() + + properties_sold = ( + inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if + inputs["survey_list"] is not None else pd.DataFrame(columns=["funding_scheme"]) + ) + properties_sold_eco4 = ( + properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if + (not properties_sold.empty) and ("ECO4" in properties_sold["funding_scheme"].values) else 0 + ) + properties_sold_gbis = ( + properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if + (not properties_sold.empty) and ("GBIS" in properties_sold["funding_scheme"].values) else 0 + ) + # We now calculate the number of remaining properties, by scheme - # TODO: We might need to tweak a bit of the logic remaining_properties = analysis_data[ analysis_data["asset_identification_status"] == "identified potential eco works (CWI)" ].copy() @@ -966,6 +976,7 @@ def analyse_ha_data(outputs, loader): remaining_properties_by_scheme = ( remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index() ) + n_remaining_properties_eco4 = remaining_properties_by_scheme[ remaining_properties_by_scheme["funding_scheme"] == "ECO4" ]["asset_list_row_id"].values[0] @@ -983,13 +994,17 @@ def analyse_ha_data(outputs, loader): # - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties # here and re-surveying is a common practicce by Warmfront. Additionally, many of the social homes have # very old EPCs which may score lower when re-done - # 2) Subject to CIGA check - Meets loft conditions but shows a filled cavity. + # 2) Meets Fabric requirements, not SAP + # Warmfront has identified the property as eligible, but the EPC is not D or below. We consider this but + # label is separately as not a strict + # 3) Subject to CIGA check - Meets loft conditions but shows a filled cavity. # - we don't have a SAP constraint here because the EPC is (currently) showing what the property might # actually look like after retrofit and so the EPC currently being a C or above means little, because # the updated EPC, showing an empty cavity, could bring the property within - # 3) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation. + # 4) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation. # - No SAP constraint, for the same reason as in category 2) - # 4) Does not look like ECO4 candidate + # 5) Looks like GBIS instead + # 6) Does not look like ECO4 candidate # # For properties that have been identified as GBIS # 1) Strict GBIS candidates @@ -1000,43 +1015,156 @@ def analyse_ha_data(outputs, loader): remaining_eco4_df = remaining_properties[ remaining_properties["funding_scheme"] == "ECO4" ].copy() + + #################################### # ECO4 + #################################### + # 1) We identify this if: # - remaining_properties["eco4_eligible"] == True remaining_eco4_df["prospect_type"] = np.where( - remaining_eco4_df["eco4_eligible"] == True, + (remaining_eco4_df["eco4_eligible"] == True), "strict ECO4", remaining_eco4_df["prospect_type"] ) - # 2) We identify this if it has a filled cavity but meets the loft conditions + # 2) Meets fabric requirements + remaining_eco4_df["prospect_type"] = np.where( + ( + (remaining_eco4_df["eco4_message"] == "sap too high") & + remaining_eco4_df["eligibility_cavity_type"].isin(["partial", "empty"]) & + remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) + ), + "ECO4 if SAP downgrade", + remaining_eco4_df["prospect_type"] + ) - remaining_eco4_df["prospect_type"] + # 3) We identify this if it has a filled cavity but meets the loft conditions + # TODO: Consider if we should also allow 100-270mm or if we should add some slight tolerance (e.g. 150mm) + # to account for measurement error + remaining_eco4_df["prospect_type"] = np.where( + ( + remaining_eco4_df["eligibility_cavity_type"].isin(["full"]) & + remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) + ), + "Filled cavity - subject to CIGA check", + remaining_eco4_df["prospect_type"] + ) - z = remaining_eco4_df[remaining_eco4_df["eco4_message"] == "sap too high"] + # 4) We identify this by ensuring the cavity if empty or partial, and the loft has between 101 and 270mm + remaining_eco4_df["prospect_type"] = np.where( + ( + remaining_eco4_df["eligibility_cavity_type"].isin(["empty", "partial"]) & + remaining_eco4_df["eligibility_loft_type"].isin(["100-270mm"]) + ), + "ECO4 prospect - empty cavity, loft insulation below regulation", + remaining_eco4_df["prospect_type"] + ) - remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts() - remaining_properties["eco4_message"].value_counts() - z = remaining_properties[ - (remaining_properties["eco4_message"] == "Possibly eligible but property currently EPC D") & - (remaining_properties["eco4_eligible"] == True) - ] + # 5) Looks like GBIS instead + remaining_eco4_df["prospect_type"] = np.where( + (remaining_eco4_df["gbis_eligible"] == True), + "Looks like GBIS", + remaining_eco4_df["prospect_type"] + ) - k = z[z["property_type"] == "Flat"] - k["uprn"] + # 6) This is everything else (i.e. both the cavity is full and the loft insulation is above 100mm) + remaining_eco4_df["prospect_type"] = remaining_eco4_df["prospect_type"].fillna( + "Does not look like ECO4 candidate" + ) - ha_analysis_results = { + #################################### + # GBIS + #################################### + + remaining_gbis = remaining_properties[ + remaining_properties["funding_scheme"] == "GBIS" + ].copy() + + # 1) Strict GBIS candidates + remaining_gbis["prospect_type"] = np.where( + ( + (remaining_gbis["gbis_eligible"] == True) & (remaining_gbis["eco4_eligible"] == False) + ), + "strict GBIS", + remaining_gbis["prospect_type"] + ) + + # 2) GBIS candidates that look like strict ECO4 candidates + remaining_gbis["prospect_type"] = np.where( + (remaining_gbis["eco4_eligible"] == True), + "Upgradable to ECO4", + remaining_gbis["prospect_type"] + ) + + # 3) Subject to CIGA check - Filled cavity + remaining_gbis["prospect_type"] = np.where( + ( + remaining_gbis["eligibility_cavity_type"].isin(["full"]) + ), + "Filled cavity - subject to CIGA check", + remaining_gbis["prospect_type"] + ) + + # 4) Everything else + remaining_gbis["prospect_type"] = remaining_gbis["prospect_type"].fillna( + "Does not look like GBIS candidate" + ) + + #################################### + # Surplus properties + #################################### + + # Take properties that were not identified by Warmfront and identify those that look like they would qualify + # under the strictest criteria + surplus_df = analysis_data[ + analysis_data["asset_identification_status"] != "identified potential eco works (CWI)" + ].copy() + + eco4_surplus = surplus_df[ + ( + (surplus_df["eco4_eligible"] == True) & (surplus_df["eco4_message"] == "subject to post retrofit sap") & + ( + surplus_df["eligibility_classification"].isin( + ["high confidence", "highest confidence", "medium confidence"] + ) + ) + ) + ].copy() + + gbis_surplus = surplus_df[ + ( + (surplus_df["gbis_eligible"] == True) & (surplus_df["eco4_eligible"] == False) & ( + surplus_df["eligibility_cavity_type"].isin(["empty", "partial"]) + ) + ) + ].copy() + + ha_analysis_results.append({ "n_properties_in_asset_list": n_properties_in_asset_list, + ############ # ECO4 + ############ "properties_sold_eco4": properties_sold_eco4, "n_remaining_properties_eco4": n_remaining_properties_eco4, + **remaining_eco4_df["prospect_type"].value_counts().to_dict(), + ############ # GBIS + ############ "properties_sold_gbis": properties_sold_gbis, - "n_remaining_properties_gbis": n_remaining_properties_gbis - } + "n_remaining_properties_gbis": n_remaining_properties_gbis, + **remaining_gbis["prospect_type"].value_counts().to_dict(), + ############ + # GBIS + ############ + "n_eco4_surplus": eco4_surplus.shape[0], + "n_gbis_surplus": gbis_surplus.shape[0], + }) - pass + ha_analysis_results = pd.DataFrame(ha_analysis_results) + + # Todo: create revenue figures and automate creation of excel def app(): @@ -1152,3 +1280,43 @@ def app(): outputs = get_epc_data( loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=False ) + + # for ha_name, datasets in outputs.items(): + # datasets["results_df"] = datasets["results_df"].drop( + # columns=["eligibility_cavity_type", "eligibility_loft_type"] + # ) + # + # # Re-do + # res = [] + # for _, row in tqdm(datasets["results_df"].iterrows(), total=datasets["results_df"].shape[0]): + # epc = { + # "walls-description": row["walls"], + # "roof-description": row["roof"], + # "floor-description": "", + # "tenure": "", + # "current-energy-efficiency": row["sap"], + # } + # eligibility = Eligibility(epc=epc, cleaned=cleaned) + # eligibility.check_eco4_warmfront() + # res.append( + # { + # "row_id": row["row_id"], + # "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"], + # "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"] + # } + # ) + # + # # Merge back on + # res = pd.DataFrame(res) + # datasets["results_df"] = datasets["results_df"].merge(res, how="left", on="row_id") + # + # # Re-save in s3 + # save_pickle_to_s3( + # data={ + # "results_df": datasets["results_df"], + # "scoring_df": datasets["scoring_df"], + # "nodata": datasets["nodata"] + # }, + # bucket_name="retrofit-datalake-dev", + # s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle" + # ) From 55e28942e48bb8cf55e7c95875533710d7e21ea1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 29 Jan 2024 12:13:22 +0000 Subject: [PATCH 47/48] Added automated creation of excel and added missing files to git --- etl/eligibility/Eligibility.py | 28 +- .../ha_15_32/WFT Sales data analysis.py | 665 ++++++++++++++++++ etl/eligibility/ha_15_32/cancellation.py | 113 +++ .../ha_15_32/ha_analysis_batch_3.py | 100 ++- 4 files changed, 876 insertions(+), 30 deletions(-) create mode 100644 etl/eligibility/ha_15_32/WFT Sales data analysis.py create mode 100644 etl/eligibility/ha_15_32/cancellation.py diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index 1d868338..906ff594 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -128,7 +128,7 @@ class Eligibility: if insulation_thickness <= 100: thickness_classification = "0-100mm" - elif insulation_thickness <= 270: + elif insulation_thickness <= high_loft_thickness_threshold: thickness_classification = "100-270mm" else: thickness_classification = "270mm+" @@ -146,24 +146,14 @@ class Eligibility: "thickness_classification": thickness_classification } - if insulation_thickness <= high_loft_thickness_threshold: - self.loft = { - "suitability": True, - "thickness": insulation_thickness, - "reason": "high loft thickness but below regulation", - "thickness_classification": thickness_classification - } - return - - if insulation_thickness > high_loft_thickness_threshold: - # Insulation is already thick enough - self.loft = { - "suitability": False, - "thickness": insulation_thickness, - "reason": "existing insulation", - "thickness_classification": thickness_classification - } - return + # Insulation is already thick enough + self.loft = { + "suitability": False, + "thickness": insulation_thickness, + "reason": "existing insulation", + "thickness_classification": thickness_classification + } + return def cavity_insulation(self): diff --git a/etl/eligibility/ha_15_32/WFT Sales data analysis.py b/etl/eligibility/ha_15_32/WFT Sales data analysis.py new file mode 100644 index 00000000..a088fe43 --- /dev/null +++ b/etl/eligibility/ha_15_32/WFT Sales data analysis.py @@ -0,0 +1,665 @@ +import numpy as np +import pandas as pd + +ECO4_NEW_RATES = 1710 +GBIS_NEW_RATES = 600 + + +def app(): + # Load in the excel + nov_ha_data = pd.read_excel( + 'etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS 21.11.2023 with sales data.xlsx', + ) + # Drop rows where HA name is null + nov_ha_data = nov_ha_data.dropna(subset=["HA Name"]) + nov_ha_data["ha_number"] = nov_ha_data["HA Name"].str.extract(r"(\d+)").astype(int) + nov_ha_data = nov_ha_data.sort_values("ha_number", ascending=True) + + variance_explanations = pd.read_excel( + 'etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS 21.11.2023 with sales data.xlsx', + sheet_name="Variance explanations" + ) + + september_figures = pd.read_excel( + "etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS SEP 23 UPDATE (2).xlsx", + sheet_name="HA Stats" + ) + + historical_invoices = pd.read_excel( + "etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS 21.11.2023 with sales data.xlsx", + sheet_name="Jul 22 to Oct 23" + ) + # Drop rows where installer rates is null + historical_invoices = historical_invoices[~pd.isnull(historical_invoices["INSTALLER RATES"])] + historical_invoices = historical_invoices[historical_invoices["INSTALLER RATES"] != "NA "] + # By Scheme, take a weighted mean of the INSTALLER RATES, weighted on the number of rows + n_invoices = historical_invoices.groupby(["Scheme", "INSTALLER RATES"])["Invoice number"].count().reset_index() + n_invoices = n_invoices[n_invoices["Scheme"].isin(["Eco 4", "GBIS"])] + historical_scheme_rates = n_invoices.groupby("Scheme").apply( + lambda x: np.average(x["INSTALLER RATES"], weights=x["Invoice number"]) + ).reset_index().rename(columns={0: "Historical rates"}) + + # we take just entries sales data that have sales > 0 + sales_data = nov_ha_data[nov_ha_data["Sales"] > 0] + + # We now need to adjust sales data depending on the variance explanations + sales_data = sales_data.merge( + variance_explanations[["HA", 'Which figure is correct']], + how="left", + left_on="ha_number", + right_on="HA" + ) + + def adjust_sales(row): + if pd.isnull(row["Which figure is correct"]): + return row["Sales"] + + if row["Which figure is correct"] == "HA facts & figures": + return row['No. of Tech surveys complete'] + + if row["Which figure is correct"] == "Billed amount": + return row["Sales"] + + if row["Which figure is correct"] in ["Both correct, HA facts and figures includes November", "Both correct"]: + return row["Sales"] + + raise ValueError(f"Unknown value for 'Which figure is correct': {row['Which figure is correct']}") + + # We now need to adjust sales data depending on the variance explanations + sales_data["adjusted_sales"] = sales_data.apply(lambda row: adjust_sales(row), axis=1) + + # We therefore adjust GBIS and ECO4 sales data based on adjusted sales + sales_data["adjusted_eco4_sales"] = sales_data["No. of Tech surveys complete - Eco 4"] / sales_data["Sales"] * \ + sales_data["adjusted_sales"] + + sales_data["adjusted_gbis_sales"] = sales_data["No. of Tech surveys complete - GBIS"] / sales_data["Sales"] * \ + sales_data["adjusted_sales"] + + sales_data["cancellation_rate"] = (sales_data["Sales"] - sales_data["adjusted_sales"]) / sales_data["Sales"] + + # The difference between the adjusted sales and the actual sales is the cancellation + cancellations = (sales_data["adjusted_sales"].sum() - sales_data["Sales"].sum()) / sales_data["Sales"].sum() + + # Given the cancellations, we can now adjust the expected remaining surveys + sales_data["No. of Tech surveys remaining"] = sales_data["No. of Tech surveys remaining"] * ( + 1 - sales_data["cancellation_rate"] + ) + + # We now merge on the expected values for September + sales_data = sales_data.merge( + september_figures[["Redacted HA", "ECO4", "GBIS"]].rename( + columns={"Redacted HA": "HA Name", "ECO4": "Sept Expected ECO4", "GBIS": "Sept Expected GBIS"} + ), + how="left", + on="HA Name", + ) + + sales_data["Sept Expected ECO4"] = sales_data["Sept Expected ECO4"].fillna(0) + sales_data["Sept Expected GBIS"] = sales_data["Sept Expected GBIS"].fillna(0) + + # We calculate the ECO4 and GBIS conversion rates with the adjusted numbers + sales_data["ECO4 Conversion"] = sales_data["adjusted_eco4_sales"] / sales_data["adjusted_sales"] + sales_data["GBIS Conversion"] = sales_data["adjusted_gbis_sales"] / sales_data["adjusted_sales"] + + # We now calculate the expected remaining ECO4 and GBIS sales + # We take the number of remaining surveys and multiply by the conversion rate for each scheme, which tells us + # how many more we should expect to see + sales_data["Expected Remaining ECO4"] = sales_data["No. of Tech surveys remaining"] * sales_data["ECO4 Conversion"] + sales_data["Expected Remaining GBIS"] = sales_data["No. of Tech surveys remaining"] * sales_data["GBIS Conversion"] + + # We now produce a forecasted ECO4 and GBIS sales figure + sales_data["Forecasted ECO4 Sales"] = sales_data["adjusted_eco4_sales"] + sales_data["Expected Remaining ECO4"] + sales_data["Forecasted GBIS Sales"] = sales_data["adjusted_gbis_sales"] + sales_data["Expected Remaining GBIS"] + + # Take the columns we're interestd in + # HA # Properties Sept ECO4 Figures Sept GBIS Figures Nov Total Sales Nov ECO4 Sales Nov GBIS Sales + # Remaining Surveys ECO4 conversion GBIS conversion Forecasted ECO4 Sales Forecasted GBIS sales ECO4 Change + # GBIS Change + sales_data_formatted = sales_data[[ + "HA Name", + "ASSET LIST no.", + "Sept Expected ECO4", + "Sept Expected GBIS", + "adjusted_sales", + "adjusted_eco4_sales", + "adjusted_gbis_sales", + "No. of Tech surveys remaining", + "ECO4 Conversion", + "GBIS Conversion", + "Forecasted ECO4 Sales", + "Forecasted GBIS Sales" + ]].rename( + columns={ + "adjusted_sales": "Oct Total Sales (adjusted for variance)", + "adjusted_eco4_sales": "Oct ECO4 Sales (adjusted for variance)", + "adjusted_gbis_sales": "Oct GBIS Sales (adjusted for variance)", + "No. of Tech surveys remaining": "Remaining Surveys", + } + ) + + # Convert columns which should be integers to integers + for col in ["ASSET LIST no.", "Remaining Surveys", "Sept Expected ECO4", "Sept Expected GBIS", + "Oct Total Sales (adjusted for variance)", "Oct ECO4 Sales (adjusted for variance)", + "Oct GBIS Sales (adjusted for variance)", "Forecasted ECO4 Sales", "Forecasted GBIS Sales"]: + sales_data_formatted[col] = sales_data_formatted[col].fillna(0) + sales_data_formatted[col] = sales_data_formatted[col].astype(int) + + # Remove HA 17 because this was EPCs only. We also remove HA33 because they do not have access to the full portfolio + sales_data_formatted = sales_data_formatted[ + ~sales_data_formatted["HA Name"].isin(["HA 17", "HA 33"]) + ] + + # September expected ECO4 and GBIS + sept_expected_eco4 = sales_data_formatted["Sept Expected ECO4"].sum() + sept_expected_gbis = sales_data_formatted["Sept Expected GBIS"].sum() + + # Completed so far + oct_eco4_sales = sales_data_formatted["Oct ECO4 Sales (adjusted for variance)"].sum() + oct_gbis_sales = sales_data_formatted["Oct GBIS Sales (adjusted for variance)"].sum() + + # Forecasted figures + forecasted_eco4_sales = sales_data_formatted["Forecasted ECO4 Sales"].sum() + forecasted_gbis_sales = sales_data_formatted["Forecasted GBIS Sales"].sum() + + # Expected remaining sales + expected_remaining_eco4_sales = forecasted_eco4_sales - oct_eco4_sales + expected_remaining_gbis_sales = forecasted_gbis_sales - oct_gbis_sales + + # Forecast change vs September + forecasted_eco4_change = 100 * (forecasted_eco4_sales - sept_expected_eco4) / sept_expected_eco4 + forecasted_gbis_change = 100 * (forecasted_gbis_sales - sept_expected_gbis) / sept_expected_gbis + + aggregates = pd.DataFrame( + columns=["Scheme", "Sept Expected", "Oct Completed", "Forecasted Remaining Sales", "Forecasted Total Sales", + "Forecasted Change vs Sept"], + data=[ + ["ECO4", sept_expected_eco4, oct_eco4_sales, expected_remaining_eco4_sales, forecasted_eco4_sales, + forecasted_eco4_change], + ["GBIS", sept_expected_gbis, oct_gbis_sales, expected_remaining_gbis_sales, forecasted_gbis_sales, + forecasted_gbis_change], + ] + ) + + # Multiply by histoical rates to get revenue + # For ECO4, this is ~£1456 and for GBIS it's ~£432 + historical_gbis_price = historical_scheme_rates[ + historical_scheme_rates["Scheme"] == "GBIS" + ]["Historical rates"].iloc[0] + + historical_eco4_price = historical_scheme_rates[ + historical_scheme_rates["Scheme"] == "Eco 4" + ]["Historical rates"].iloc[0] + + aggregates["Sept Expected Revenue"] = np.where( + aggregates["Scheme"] == "ECO4", + aggregates["Sept Expected"] * historical_eco4_price, + aggregates["Sept Expected"] * historical_gbis_price + ) + + aggregates["Completed Revenue"] = np.where( + aggregates["Scheme"] == "ECO4", + aggregates["Oct Completed"] * historical_eco4_price, + aggregates["Oct Completed"] * historical_gbis_price + ) + + # We use the new rates for the forecasted revenue + aggregates["Forecasted Remaining Revenue"] = np.where( + aggregates["Scheme"] == "ECO4", + aggregates["Forecasted Remaining Sales"] * ECO4_NEW_RATES, + aggregates["Forecasted Remaining Sales"] * GBIS_NEW_RATES + ) + + # We also calculate the forecasted remaining revenue at the original price + aggregates["Forecasted Remaining Revenue (original price)"] = np.where( + aggregates["Scheme"] == "ECO4", + aggregates["Forecasted Remaining Sales"] * historical_eco4_price, + aggregates["Forecasted Remaining Sales"] * historical_gbis_price + ) + + aggregates["Forecasted Revenue"] = aggregates["Completed Revenue"] + aggregates["Forecasted Remaining Revenue"] + + # Forecasted revenue with original price + aggregates["Forecasted Revenue (original price)"] = ( + aggregates["Completed Revenue"] + aggregates["Forecasted Remaining Revenue (original price)"] + ) + + # Create a totals row which sums up the two rows + + forecasted_change_vs_sept = 100 * ( + aggregates["Forecasted Total Sales"].sum() - aggregates["Sept Expected"].sum() + ) / aggregates["Sept Expected"].sum() + + aggregates = pd.concat( + [ + aggregates, + pd.DataFrame( + [ + ["Total", aggregates["Sept Expected"].sum(), aggregates["Oct Completed"].sum(), + aggregates["Forecasted Remaining Sales"].sum(), aggregates["Forecasted Total Sales"].sum(), + forecasted_change_vs_sept, + aggregates["Sept Expected Revenue"].sum(), aggregates["Completed Revenue"].sum(), + aggregates["Forecasted Remaining Revenue"].sum(), + aggregates["Forecasted Remaining Revenue (original price)"].sum(), + aggregates["Forecasted Revenue"].sum(), + aggregates["Forecasted Revenue (original price)"].sum(), + ] + ], + columns=aggregates.columns + ) + ] + ) + + # For each property in the asset list, we now calculate an average conversion rate to ECO4 and GBIS + # We do this by taking the forecasted sales values for each schemes and dividing by the number of properties + + number_properties = sales_data_formatted["ASSET LIST no."].sum() + eco4_conversion_rate = forecasted_eco4_sales / number_properties + gbis_conversion_rate = forecasted_gbis_sales / number_properties + + # We also attribute a future value per property + future_eco4_value = ECO4_NEW_RATES * eco4_conversion_rate + future_gbis_value = GBIS_NEW_RATES * gbis_conversion_rate + + # We also calulate a revenue figure for the old rates + historical_eco4_value = historical_eco4_price * eco4_conversion_rate + historical_gbis_value = historical_gbis_price * gbis_conversion_rate + + # For the HAs that have not begun selling, we estimate the value of the projects + # We start with some problem HAs + + # HA 7, HA 24, HA 25 + # These HAs have no sales data, so we use the expected figures + + problem_has_data = nov_ha_data[ + (nov_ha_data["HA Name"].isin(["HA 7", "HA 24", "HA 25"])) + ].copy() + # Merge on the september expected figures + problem_has_data = problem_has_data.merge( + september_figures[["Redacted HA", "ECO4", "GBIS"]].rename( + columns={"Redacted HA": "HA Name", "ECO4": "Sept Expected ECO4", "GBIS": "Sept Expected GBIS"} + ), + how="left", + on="HA Name", + ) + # Fill NAs + problem_has_data["Sept Expected ECO4"] = problem_has_data["Sept Expected ECO4"].fillna(0) + problem_has_data["Sept Expected GBIS"] = problem_has_data["Sept Expected GBIS"].fillna(0) + + # We now calculate the expected ECO4 and GBIS sales based on the average conversion rates + problem_has_data["Expected ECO4 Sales"] = problem_has_data["ASSET LIST no."] * eco4_conversion_rate + problem_has_data["Expected GBIS Sales"] = problem_has_data["ASSET LIST no."] * gbis_conversion_rate + + # Filter just on columns we're interested in + problem_has_data = problem_has_data[[ + "HA Name", + "ASSET LIST no.", + "Sept Expected ECO4", + "Sept Expected GBIS", + "ECO4", + "GBIS", + "Expected ECO4 Sales", + "Expected GBIS Sales" + ]].rename( + columns={ + "ECO4": "Nov Expected ECO4", + "GBIS": "Nov Expected GBIS", + } + ) + + # Fill NAs + problem_has_data["Nov Expected ECO4"] = problem_has_data["Nov Expected ECO4"].fillna(0) + problem_has_data["Nov Expected GBIS"] = problem_has_data["Nov Expected GBIS"].fillna(0) + + # We calculate HA level Sept, Nov expected revenue, based on historical rates and then forecasted revenue + problem_has_data["Sept Expected ECO4 Value"] = problem_has_data["Sept Expected ECO4"] * historical_eco4_price + problem_has_data["Sept Expected GBIS Value"] = problem_has_data["Sept Expected GBIS"] * historical_gbis_price + + problem_has_data["Nov Expected ECO4 Value"] = problem_has_data["Nov Expected ECO4"] * historical_eco4_price + problem_has_data["Nov Expected GBIS Value"] = problem_has_data["Nov Expected GBIS"] * historical_gbis_price + + problem_has_data["Forecasted ECO4 Revenue"] = problem_has_data["ASSET LIST no."] * future_eco4_value + problem_has_data["Forecasted GBIS Revenue"] = problem_has_data["ASSET LIST no."] * future_gbis_value + + # Totals + problem_has_data["Sept Expected Total Value"] = problem_has_data["Sept Expected ECO4 Value"] + \ + problem_has_data["Sept Expected GBIS Value"] + problem_has_data["Nov Expected Total Value"] = problem_has_data["Nov Expected ECO4 Value"] + \ + problem_has_data["Nov Expected GBIS Value"] + problem_has_data["Forecasted Total Revenue"] = problem_has_data["Forecasted ECO4 Revenue"] + \ + problem_has_data["Forecasted GBIS Revenue"] + + # We calculate a total expected value for September, November and then forecasted + problem_has_expected_eco4_value = problem_has_data["Sept Expected ECO4"].sum() * historical_eco4_price + problem_has_expected_gbis_value = problem_has_data["Sept Expected GBIS"].sum() * historical_gbis_price + problem_has_expected_total_value = problem_has_expected_eco4_value + problem_has_expected_gbis_value + + problem_has_nov_eco4_value = problem_has_data["Nov Expected ECO4"].sum() * historical_eco4_price + problem_has_nov_gbis_value = problem_has_data["Nov Expected GBIS"].sum() * historical_gbis_price + problem_has_nov_total_value = problem_has_nov_eco4_value + problem_has_nov_gbis_value + + forecasted_eco4_value = problem_has_data["ASSET LIST no."].sum() * future_eco4_value + forecasted_gbis_value = problem_has_data["ASSET LIST no."].sum() * future_gbis_value + problem_has_forecasted_total_value = forecasted_eco4_value + forecasted_gbis_value + + problem_has_summary = pd.DataFrame( + columns=["Scheme", "Sept Expected", "Nov Expected", "Forecasted"], + data=[ + ["ECO4", problem_has_expected_eco4_value, problem_has_nov_eco4_value, forecasted_eco4_value], + ["GBIS", problem_has_expected_gbis_value, problem_has_nov_gbis_value, forecasted_gbis_value], + ["Total", problem_has_expected_total_value, problem_has_nov_total_value, problem_has_forecasted_total_value] + ] + ) + + # We now also estimate the value of the remaining HAs based on historical sales performance and new rates + # We take the has that are not in the sales data + remaining_has = nov_ha_data[ + ~nov_ha_data["HA Name"].isin(sales_data_formatted["HA Name"]) + ].copy() + + # Merge on the september expected figures + remaining_has = remaining_has.merge( + september_figures[["Redacted HA", "ECO4", "GBIS"]].rename( + columns={"Redacted HA": "HA Name", "ECO4": "Sept Expected ECO4", "GBIS": "Sept Expected GBIS"} + ), + how="left", + on="HA Name", + ) + + # We update the asset list size for HA 33, because they do not have access to the full portfolio + remaining_has.loc[remaining_has["HA Name"] == "HA 33", "ASSET LIST no."] = 20699 + # We also remove HA 17 + remaining_has = remaining_has[~remaining_has["HA Name"].isin(["HA 17"])] + + # We now calculate the expected ECO4 and GBIS sales based on the average conversion rates + remaining_has["Expected ECO4 Sales"] = remaining_has["ASSET LIST no."] * eco4_conversion_rate + remaining_has["Expected GBIS Sales"] = remaining_has["ASSET LIST no."] * gbis_conversion_rate + + # Filter just on columns we're interested in + remaining_has = remaining_has[[ + "HA Name", + "ASSET LIST no.", + "Sept Expected ECO4", + "Sept Expected GBIS", + "ECO4", + "GBIS", + ]].rename( + columns={ + "ECO4": "Nov Expected ECO4", + "GBIS": "Nov Expected GBIS", + } + ) + + remaining_has = remaining_has.fillna(0) + + # We take just HAs that had an initial september expectation for ECO4 or GBIS, or that now have a Nov expectation + remaining_has = remaining_has[ + (remaining_has["Sept Expected ECO4"] > 0) | (remaining_has["Sept Expected GBIS"] > 0) | + (remaining_has["Nov Expected ECO4"] > 0) | (remaining_has["Nov Expected GBIS"] > 0) + ] + + # Expected sales based on asset list size and conversion rate + remaining_has["Forecasted Sales ECO4"] = remaining_has["ASSET LIST no."] * eco4_conversion_rate + remaining_has["Forecasted Sales GBIS"] = remaining_has["ASSET LIST no."] * gbis_conversion_rate + + # Calculat the total expected value for September and November + remaining_has["Sept Expected ECO4 Value"] = remaining_has["Sept Expected ECO4"] * historical_eco4_price + remaining_has["Sept Expected GBIS Value"] = remaining_has["Sept Expected GBIS"] * historical_gbis_price + + remaining_has["Nov Expected ECO4 Value"] = remaining_has["Nov Expected ECO4"] * historical_eco4_price + remaining_has["Nov Expected GBIS Value"] = remaining_has["Nov Expected GBIS"] * historical_gbis_price + + # Calculate forecasted revenue + remaining_has["Forecasted ECO4 Revenue"] = remaining_has["ASSET LIST no."] * future_eco4_value + remaining_has["Forecasted GBIS Revenue"] = remaining_has["ASSET LIST no."] * future_gbis_value + + # We also calculate forecasted revenue with the original price + remaining_has["Forecasted ECO4 Revenue (original price)"] = remaining_has["ASSET LIST no."] * historical_eco4_value + remaining_has["Forecasted GBIS Revenue (original price)"] = remaining_has["ASSET LIST no."] * historical_gbis_value + + # Calculate totals for each scheme + remaining_has_september_eco4_sales = remaining_has["Sept Expected ECO4"].sum() + remaining_has_september_gbis_sales = remaining_has["Sept Expected GBIS"].sum() + + remaining_has_november_eco4_sales = remaining_has["Nov Expected ECO4"].sum() + remaining_has_november_gbis_sales = remaining_has["Nov Expected GBIS"].sum() + + remaining_has_forecasted_eco4_sales = remaining_has["Forecasted Sales ECO4"].sum() + remaining_has_forecasted_gbis_sales = remaining_has["Forecasted Sales GBIS"].sum() + + remaining_has_september_eco4_value = remaining_has["Sept Expected ECO4 Value"].sum() + remaining_has_september_gbis_value = remaining_has["Sept Expected GBIS Value"].sum() + + remaining_has_november_eco4_value = remaining_has["Nov Expected ECO4 Value"].sum() + remaining_has_november_gbis_value = remaining_has["Nov Expected GBIS Value"].sum() + + remaining_has_forecasted_eco4_value = remaining_has["Forecasted ECO4 Revenue"].sum() + remaining_has_forecasted_gbis_value = remaining_has["Forecasted GBIS Revenue"].sum() + + remaining_has_forecasted_eco4_value_original_price = remaining_has["Forecasted ECO4 Revenue (original price)"].sum() + remaining_has_forecasted_gbis_value_original_price = remaining_has["Forecasted GBIS Revenue (original price)"].sum() + + # Calculate the change in forecasted sales against the September expected sales + remaining_has_foecast_change_eco4 = 100 * ( + remaining_has["Forecasted Sales ECO4"].sum() - remaining_has["Sept Expected ECO4"].sum() + ) / remaining_has["Sept Expected ECO4"].sum() + + remaining_has_foecast_change_gbis = 100 * ( + remaining_has["Forecasted Sales GBIS"].sum() - remaining_has["Sept Expected GBIS"].sum() + ) / remaining_has["Sept Expected GBIS"].sum() + + # Total change + remaining_has_foecast_change_total = 100 * ( + remaining_has["Forecasted Sales ECO4"].sum() + remaining_has["Forecasted Sales GBIS"].sum() - + remaining_has["Sept Expected ECO4"].sum() - remaining_has["Sept Expected GBIS"].sum() + ) / (remaining_has["Sept Expected ECO4"].sum() + remaining_has["Sept Expected GBIS"].sum()) + + asset_list_size = remaining_has["ASSET LIST no."].sum() + + # Create a summary table of the rest with the totals for ECO4, GBIS and then a total row + remaining_has_aggregate = pd.DataFrame( + columns=["Scheme", "Asset List Size", "Sept Expected Sales", "Nov Expected Sales", "Forecasted Sales", + "Forecasted Change vs Sept", + "Sept Expected Value", "Nov Expected Value", "Forecasted Value", "Forecasted Value (original price)"], + data=[ + [ + "ECO4", asset_list_size, remaining_has_september_eco4_sales, remaining_has_november_eco4_sales, + remaining_has_forecasted_eco4_sales, remaining_has_foecast_change_eco4, + remaining_has_september_eco4_value, + remaining_has_november_eco4_value, remaining_has_forecasted_eco4_value, + remaining_has_forecasted_eco4_value_original_price + ], + [ + "GBIS", asset_list_size, remaining_has_september_gbis_sales, remaining_has_november_gbis_sales, + remaining_has_forecasted_gbis_sales, remaining_has_foecast_change_gbis, + remaining_has_september_gbis_value, + remaining_has_november_gbis_value, remaining_has_forecasted_gbis_value, + remaining_has_forecasted_gbis_value_original_price + ], + [ + "Total", asset_list_size, remaining_has_september_eco4_sales + remaining_has_september_gbis_sales, + remaining_has_november_eco4_sales + remaining_has_november_gbis_sales, + remaining_has_forecasted_eco4_sales + remaining_has_forecasted_gbis_sales, + remaining_has_foecast_change_total, + remaining_has_september_eco4_value + remaining_has_september_gbis_value, + remaining_has_november_eco4_value + remaining_has_november_gbis_value, + remaining_has_forecasted_eco4_value + remaining_has_forecasted_gbis_value, + remaining_has_forecasted_eco4_value_original_price + + remaining_has_forecasted_gbis_value_original_price + ] + ] + ) + + # Calculate pipeline value + pipeline_value = aggregates[["Scheme", "Completed Revenue", "Forecasted Remaining Revenue"]].merge( + remaining_has_aggregate[["Scheme", "Forecasted Value"]].rename( + columns={"Forecasted Value": "Forecasted Revenue, Unconfirmed HAs"} + ), how="inner", on="Scheme" + ) + + # Calculate the total + pipeline_value["Total Value"] = ( + pipeline_value["Completed Revenue"] + pipeline_value["Forecasted Remaining Revenue"] + pipeline_value[ + "Forecasted Revenue, Unconfirmed HAs"] + ) + + # TODO: Insert model figures + model_results = pd.DataFrame( + [ + { + # This one, we don't have sales data + "HA Name": "HA 15", + "Model Expected Additional ECO4 (unit level)": None, + "Model Expected Total ECO4 (unit level)": 296, + "Model Expected Additional GBIS (unit level)": None, + "Model Expected Total GBIS (unit level)": 209, + }, + { + "HA Name": "HA 16", + # Old before re-run + # "Model Expected Additional ECO4 (unit level)": 418, + # "Model Expected Total ECO4 (unit level)": 1820, + # "Model Expected Additional GBIS (unit level)": 576, + # "Model Expected Total GBIS (unit level)": 612, + + # IN the partial sales data, WFT have completed 1407 ECO4, 36 GBIS + "Model Expected Additional ECO4 (unit level)": 411 + 342 + 235, + "Model Expected Total ECO4 (unit level)": 1407 + 411 + 342 + 235, + "Model Expected Additional GBIS (unit level)": 223, + "Model Expected Total GBIS (unit level)": 36 + 223, + }, + { + "HA Name": "HA 24", + "Model Expected Additional ECO4 (unit level)": 224, + "Model Expected Total ECO4 (unit level)": 848, + "Model Expected Additional GBIS (unit level)": 552, + "Model Expected Total GBIS (unit level)": 552, + }, + { + "HA Name": "HA 25", + "Model Expected Additional ECO4 (unit level)": None, + "Model Expected Total ECO4 (unit level)": 1709 + 59, + "Model Expected Additional GBIS (unit level)": None, + "Model Expected Total GBIS (unit level)": 2004 + 107, + } + ] + ) + + sales_data_formatted["Remaining ECO4 Sales"] = ( + sales_data_formatted["Forecasted ECO4 Sales"] - sales_data_formatted["Oct ECO4 Sales (adjusted for variance)"] + ) + + sales_data_formatted["Remaining GBIS Sales"] = ( + sales_data_formatted["Forecasted GBIS Sales"] - sales_data_formatted["Oct GBIS Sales (adjusted for variance)"] + ) + + sales_data_formatted["Completed ECO4 Revenue"] = (sales_data_formatted[ + "Oct ECO4 Sales (adjusted for variance)"] * + historical_eco4_price) + sales_data_formatted["Completed GBIS Revenue"] = (sales_data_formatted[ + "Oct GBIS Sales (adjusted for variance)"] * + historical_gbis_price) + + ha_subset_with_sales = ["HA 15", "HA 16", "HA 24"] + + has_subset_with_sales_value = sales_data_formatted[ + sales_data_formatted["HA Name"].isin(ha_subset_with_sales) + ].copy()[ + [ + "HA Name", + "Oct ECO4 Sales (adjusted for variance)", + "Oct GBIS Sales (adjusted for variance)", + "Remaining ECO4 Sales", + "Remaining GBIS Sales", + "Forecasted ECO4 Sales", + "Forecasted GBIS Sales", + "Completed ECO4 Revenue", + "Completed GBIS Revenue" + ] + ] + + has_subset_with_sales_value["Remaining ECO4 Revenue"] = has_subset_with_sales_value[ + "Remaining ECO4 Sales"] * ECO4_NEW_RATES + has_subset_with_sales_value["Remaining GBIS Revenue"] = has_subset_with_sales_value[ + "Remaining GBIS Sales"] * GBIS_NEW_RATES + + has_subset_with_sales_value["Remaining Total Revenue"] = ( + has_subset_with_sales_value["Remaining ECO4 Revenue"] + has_subset_with_sales_value["Remaining GBIS Revenue"] + ) + + model_results["Model Expected Additional ECO4 Revenue"] = ( + model_results["Model Expected Additional ECO4 (unit level)"] * ECO4_NEW_RATES + ) + + model_results["Model Expected Additional GBIS revenue"] = ( + model_results["Model Expected Additional GBIS (unit level)"] * GBIS_NEW_RATES + ) + + model_results["Model Expected Additional Total Revenue"] = ( + model_results["Model Expected Additional ECO4 Revenue"] + model_results[ + "Model Expected Additional GBIS revenue"] + ) + + # Show more columns with pandas + pd.set_option('display.max_rows', 500) + pd.set_option('display.max_columns', 500) + pd.set_option('display.width', 1000) + + # Look at HA 16 + ha16_model = model_results[model_results["HA Name"] == "HA 16"] + has_subset_with_sales_value[has_subset_with_sales_value["HA Name"] == "HA 16"] + + # WFT: For HA 16: 4,598,190 ECO4, 57,000 GBIS + # Model: + + # Look at HA 24 + ha24_model = model_results[model_results["HA Name"] == "HA 24"] + has_subset_with_sales_value[has_subset_with_sales_value["HA Name"] == "HA 24"] + + # Look at HA 15 + ha15_data = has_subset_with_sales_value[has_subset_with_sales_value["HA Name"] == "HA 15"] + ha15_portfolio_value = ha15_data["Completed ECO4 Revenue"] + ha15_data[ + "Completed GBIS Revenue"] + ha15_data["Remaining Total Revenue"] + # # This doesn't have sales data so in the model analysis, we just value the ha as a whole + ha15_model = model_results[model_results["HA Name"] == "HA 15"] + ha15_value = ha15_model["Model Expected Total ECO4 (unit level)"].iloc[0] * ECO4_NEW_RATES + \ + ha15_model["Model Expected Total GBIS (unit level)"].iloc[0] * GBIS_NEW_RATES + + model_results["Expected ECO4 Revenue"] = model_results["Model Expected Total ECO4 (unit level)"] * ECO4_NEW_RATES + model_results["Expected GBIS Revenue"] = model_results["Model Expected Total GBIS (unit level)"] * GBIS_NEW_RATES + model_results["Expected Total Revenue"] = model_results["Expected ECO4 Revenue"] + model_results[ + "Expected GBIS Revenue"] + model_results[model_results["HA Name"].isin(["HA 15"])] + + # We now create a final excel with all of the data + # We want: + # 1) aggregates + # 2) sales_data_formatted + # 3) remaining_has_aggregate + # 4) remaining_has + # 5) problem_has_summary + + # Function to get the maximum column width + def get_col_widths(dataframe): + # First we find the maximum length of the index column + idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))]) + # Then, we concatenate this to the max of the lengths of column name and its max value for each column, row-wise + return [idx_max] + [max(dataframe[col].astype(str).map(len).max(), len(col)) for col in dataframe.columns] + + # Create a Pandas Excel writer using XlsxWriter as the engine + with pd.ExcelWriter('HA Pipeline Analysis.xlsx', engine='xlsxwriter') as writer: + # Write each dataframe to a different worksheet without the index + for df, sheet in [(aggregates, 'Forecasted Sales'), + (sales_data_formatted, 'Sales Data'), + (remaining_has_aggregate, 'Remaining HAs Value'), + (remaining_has, 'Remaining HAs data'), + (pipeline_value, 'Pipeline Value'), + (problem_has_summary, 'Problem HAs Analysis'), + (problem_has_data, 'Problem HAs Data') + + ]: + + df.to_excel(writer, sheet_name=sheet, index=False) + + # Auto-adjust columns' width + for i, width in enumerate(get_col_widths(df)): + writer.sheets[sheet].set_column(i, i, width) diff --git a/etl/eligibility/ha_15_32/cancellation.py b/etl/eligibility/ha_15_32/cancellation.py new file mode 100644 index 00000000..849add45 --- /dev/null +++ b/etl/eligibility/ha_15_32/cancellation.py @@ -0,0 +1,113 @@ +import openpyxl +import pandas as pd +import numpy as np + + +def get_excel_survey_list(workbook_path, worksheet_name=None): + survey_workbook = openpyxl.load_workbook(workbook_path) + if worksheet_name is not None: + survey_sheet = survey_workbook[worksheet_name] + else: + survey_sheet = survey_workbook.active + + survey_rows = [] + survey_colors = [] + + for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values + row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None + # row_color = COLOR_INDEX[row_color] + survey_rows.append(row_data) + survey_colors.append(row_color) + + survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]]) + survey_list["row_colour"] = survey_colors + + return survey_list + + +def load_data(): + # Load for HA 16 - ECO 4 + ha16_survey_list = get_excel_survey_list('etl/eligibility/ha_15_32/HESTIA- HA 16 ECO4 SURVEY LIST.xlsx') + + # Load for HA 24 - ECO 4 + ha24_survey_list = get_excel_survey_list('etl/eligibility/ha_15_32/HESTIA - HA 24 ECO4 SURVEY LIST.xlsx') + + # Load for HA 25 - ECO 3 + ha25_survey_list = get_excel_survey_list( + 'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx', worksheet_name="CAVITY" + ) + + # Remove columns with None column names + ha25_survey_list = ha25_survey_list.dropna(axis=1, how='all') + + # Standardised this installation status columns + ha16_survey_list["survey_status"] = ha16_survey_list["INSTALLED OR CANCELLED"].copy() + ha16_survey_list["survey_status"] = ha16_survey_list["survey_status"].replace( + { + "NO UPDATE - CHECKED 2.10.23": "no update", + "NO UPDATE - CHECKED 18.12.23": "no update", + "INSTALLED": "installed", + "CANCELLED": "cancelled", + "LOFT STILL TO BE INSTALLED": "loft remaining", + } + ) + + ha24_survey_list["survey_status"] = ha24_survey_list["INSTALLED OR CANCELLED"].copy() + ha24_survey_list["survey_status"] = ha24_survey_list["survey_status"].replace( + { + "NO UPDATE - CHECKED 21.11.23": "no update", + "NO UPDATE - CHECKED 18.12.23": "no update", + "INSTALLED": "installed", + "CANCELLED": "cancelled", + "LOFT STILL TO BE INSTALLED": "loft remaining", + "SEE NOTES >>": "see notes", + } + ) + + # We need to prepare HA25 differently + ha25_survey_list["survey_status"] = np.where( + ha25_survey_list["row_colour"] == "FF7030A0", "installed", + np.where(ha25_survey_list["row_colour"] == "FF92D050", "installed", + np.where(ha25_survey_list["row_colour"] == "FFFF0000", "cancelled", + np.where(ha25_survey_list["row_colour"] == "FFFFFF00", "filler row - drop", + np.where(ha25_survey_list["row_colour"] == "FF38FD23", "installed", "unknown") + ) + ) + ) + ) + ha25_survey_list = ha25_survey_list[ha25_survey_list["survey_status"] != "filler row - drop"] + + # We standardise the cancellation reasons - just create a new column + ha16_survey_list["cancellation_reason"] = ha16_survey_list["INSTALLERS NOTES ; REASONS FOR CANCELLATIONS"].copy() + ha24_survey_list["cancellation_reason"] = ha24_survey_list["INSTALLERS NOTES ; REASONS FOR CANCELLATIONS"].copy() + # There's no cancellation reason for HA25 + ha25_survey_list["cancellation_reason"] = "No reason provided" + + # Combine the dataframes + ha16_survey_list["HA"] = "HA 16" + ha24_survey_list["HA"] = "HA 24" + ha25_survey_list["HA"] = "HA 25" + + cancellation_data = pd.concat( + [ + ha16_survey_list[["HA", "survey_status", "cancellation_reason"]], + ha24_survey_list[["HA", "survey_status", "cancellation_reason"]], + ha25_survey_list[["HA", "survey_status", "cancellation_reason"]] + ] + ) + + # Take just rows that we have a confirmed status for + cancellation_data = cancellation_data[~cancellation_data["survey_status"].isin(["no update", "loft remaining"])] + + return cancellation_data + + +def app(): + """ + This application is used to analyse the cancellation data provided by warmfront + :return: + """ + + # This is cancellations of jobs that completed invasive surveys and the installer could not conclude the work + sales_cancellation_data = load_data() diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1ed95a30..e94babcd 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -882,6 +882,13 @@ def get_epc_data( return outputs +def get_col_widths(dataframe): + # First we find the maximum length of the index column + idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))]) + # Then, we concatenate this to the max of the lengths of column name and its max value for each column, row-wise + return [idx_max] + [max(dataframe[col].astype(str).map(len).max(), len(col)) for col in dataframe.columns] + + def analyse_ha_data(outputs, loader): """ The approach we take within this function is the following: @@ -901,7 +908,11 @@ def analyse_ha_data(outputs, loader): :return: """ + eco4_rate = 1710 + gbis_rate = 600 + ha_analysis_results = [] + ha_revenue_results = [] for ha_name, datasets in outputs.items(): inputs = [x for k, x in loader.data.items() if k == ha_name][0] @@ -1034,7 +1045,8 @@ def analyse_ha_data(outputs, loader): ( (remaining_eco4_df["eco4_message"] == "sap too high") & remaining_eco4_df["eligibility_cavity_type"].isin(["partial", "empty"]) & - remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) + remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) & + pd.isnull(remaining_eco4_df["prospect_type"]) ), "ECO4 if SAP downgrade", remaining_eco4_df["prospect_type"] @@ -1048,7 +1060,7 @@ def analyse_ha_data(outputs, loader): remaining_eco4_df["eligibility_cavity_type"].isin(["full"]) & remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) ), - "Filled cavity - subject to CIGA check", + "ECO4 - Filled cavity - subject to CIGA check", remaining_eco4_df["prospect_type"] ) @@ -1064,7 +1076,7 @@ def analyse_ha_data(outputs, loader): # 5) Looks like GBIS instead remaining_eco4_df["prospect_type"] = np.where( - (remaining_eco4_df["gbis_eligible"] == True), + (remaining_eco4_df["gbis_eligible"] == True) & pd.isnull(remaining_eco4_df["prospect_type"]), "Looks like GBIS", remaining_eco4_df["prospect_type"] ) @@ -1094,16 +1106,17 @@ def analyse_ha_data(outputs, loader): # 2) GBIS candidates that look like strict ECO4 candidates remaining_gbis["prospect_type"] = np.where( (remaining_gbis["eco4_eligible"] == True), - "Upgradable to ECO4", + "GBIS - Upgradable to ECO4", remaining_gbis["prospect_type"] ) # 3) Subject to CIGA check - Filled cavity remaining_gbis["prospect_type"] = np.where( ( - remaining_gbis["eligibility_cavity_type"].isin(["full"]) + remaining_gbis["eligibility_cavity_type"].isin(["full"]) & + pd.isnull(remaining_gbis["prospect_type"]) ), - "Filled cavity - subject to CIGA check", + "GBIS - Filled cavity - subject to CIGA check", remaining_gbis["prospect_type"] ) @@ -1141,30 +1154,95 @@ def analyse_ha_data(outputs, loader): ) ].copy() - ha_analysis_results.append({ + # Perform some checks to make sure we have all of the values + remaining_eco4_dict = remaining_eco4_df["prospect_type"].value_counts().to_dict() + if n_remaining_properties_eco4 != sum([v for k, v in remaining_eco4_dict.items()]): + raise ValueError( + "Number of remaining properties does not match the number of properties in remaining ECO4 dict" + ) + + remaining_gbis_dict = remaining_gbis["prospect_type"].value_counts().to_dict() + if n_remaining_properties_gbis != sum([v for k, v in remaining_gbis_dict.items()]): + raise ValueError( + "Number of remaining properties does not match the number of properties in remaining GBIS dict" + ) + + to_append = { + "ha_name": ha_name, "n_properties_in_asset_list": n_properties_in_asset_list, ############ # ECO4 ############ "properties_sold_eco4": properties_sold_eco4, "n_remaining_properties_eco4": n_remaining_properties_eco4, - **remaining_eco4_df["prospect_type"].value_counts().to_dict(), + **remaining_eco4_dict, ############ # GBIS ############ "properties_sold_gbis": properties_sold_gbis, "n_remaining_properties_gbis": n_remaining_properties_gbis, - **remaining_gbis["prospect_type"].value_counts().to_dict(), + **remaining_gbis_dict, ############ # GBIS ############ "n_eco4_surplus": eco4_surplus.shape[0], "n_gbis_surplus": gbis_surplus.shape[0], - }) + } + + ha_analysis_results.append(to_append) + + revenue_to_append = { + "ha_name": ha_name, + "£ Remaining from asset list": ( + n_remaining_properties_eco4 * eco4_rate + n_remaining_properties_gbis * gbis_rate + ), + "Of which: Strict": ( + to_append.get('strict ECO4', 0) * eco4_rate + to_append.get('strict GBIS', 0) * gbis_rate + + to_append.get('GBIS - Upgradable to ECO4', 0) * gbis_rate + ), + "Of which: Subject to CIGA": ( + to_append.get("ECO4 - Filled cavity - subject to CIGA check", 0) * eco4_rate + + to_append.get("GBIS - Filled cavity - subject to CIGA check", 0) * gbis_rate + ), + "Of which: Prospect, not perfect strict prospect": ( + to_append.get("ECO4 prospect - empty cavity, loft insulation below regulation", 0) * eco4_rate + + to_append.get("ECO4 if SAP downgrade", 0) * eco4_rate + ), + "Of which: Potential downgrade to GBIS": to_append["Looks like GBIS"] * eco4_rate, + "Of which: Does not look like prospect": ( + to_append.get("Does not look like ECO4 candidate", 0) * eco4_rate + + to_append.get("Does not look like GBIS candidate", 0) * gbis_rate + ), + "Surplus: Unidentified properties": eco4_surplus.shape[0] * eco4_rate + gbis_surplus.shape[0] * gbis_rate, + "Surplus: GBIS Updates to ECO4": to_append.get("GBIS - Upgradable to ECO4", 0) * (eco4_rate - gbis_rate) + } + + # Perform a quick check: + if revenue_to_append["£ Remaining from asset list"] - ( + revenue_to_append["Of which: Strict"] + revenue_to_append["Of which: Subject to CIGA"] + + revenue_to_append["Of which: Prospect, not perfect strict prospect"] + + revenue_to_append["Of which: Potential downgrade to GBIS"] + + revenue_to_append["Of which: Does not look like prospect"] + ) > 1: + raise ValueError("Error between top level revenue figures and breakdown - investigate me") + + ha_revenue_results.append(revenue_to_append) ha_analysis_results = pd.DataFrame(ha_analysis_results) + ha_revenue_results = pd.DataFrame(ha_revenue_results) - # Todo: create revenue figures and automate creation of excel + # Automate creation of the excel + # Create a Pandas Excel writer using XlsxWriter as the engine + with pd.ExcelWriter('HA Analysis - batch3.xlsx', engine='xlsxwriter') as writer: + # Write each dataframe to a different worksheet without the index + for df, sheet in [(ha_revenue_results, 'Total Revenue'), + (ha_analysis_results, 'By ECO4 and GBIS')]: + + df.to_excel(writer, sheet_name=sheet, index=False) + + # Auto-adjust columns' width + for i, width in enumerate(get_col_widths(df)): + writer.sheets[sheet].set_column(i, i, width) def app(): From 6a5430d214d60c0075ed0ad6c38655d34c108a1b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 29 Jan 2024 12:33:33 +0000 Subject: [PATCH 48/48] Pulling ventilation from epc_record class --- backend/Property.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/Property.py b/backend/Property.py index 82695b75..c9cad22f 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -83,7 +83,7 @@ class Property: "co2_emissions": epc_record.get("co2_emissions_current"), } self.ventilation = { - "ventilation": epc_record.prepared_epc.get("mechanical_ventilation"), + "ventilation": epc_record.get("mechanical_ventilation"), } self.solar_pv = { "solar_pv": epc_record.get("photo_supply"),