class OrdnanceSuveyClient:
    """
    Client for the Ordnance Survey Places API ``find`` endpoint.

    Looks up a single address, keeps the best match and derives an EPC-compatible property type and built form
    from that match's classification code.
    """

    # requests waits indefinitely unless a timeout is supplied, so cap how long a lookup may block
    REQUEST_TIMEOUT_SECONDS = 10

    def __init__(self, address, postcode, api_key):
        """
        This class is tasked with interaction with the ordnance survey API.
        :param address: The address for the property to search for
        :param postcode: The postcode for the property to search for
        :param api_key: The Ordnance Survey API key sent with every request
        """

        self.address = address
        self.postcode = postcode
        self.full_address = ", ".join([self.address, self.postcode])
        self.api_key = api_key

        self.results = None

        self.most_relevant_result = None
        self.property_type = None
        self.built_form = None
        # This will be postcode and address, as returned by the ordnance survey
        self.address_os = None
        self.postcode_os = None

        # Per-instance memo of the last lookup status. This replaces functools.lru_cache on the method, which
        # keyed on ``self`` and kept every client instance alive inside a module-level cache.
        self._places_status = None

    def set_places_address(self):
        """
        Given a response from the places api, this function will set the address and postcode of the property
        :raises ValueError: if no best match has been stored yet (get_places_api has not found a result)
        """

        if self.most_relevant_result is None:
            raise ValueError("No results found - run get_places_api first")

        self.address_os = self.most_relevant_result["ADDRESS"]
        self.postcode_os = self.most_relevant_result["POSTCODE"]
        # We strip out the postcode from the address as this is already stored separately
        self.address_os = self.address_os.replace(self.postcode_os, "").strip()
        # Remove trailing comma
        self.address_os = self.address_os.rstrip(",").strip()
        # Convert to title case
        self.address_os = self.address_os.title()
        # Make sure postcode is upper case
        self.postcode_os = self.postcode_os.upper()

    def get_places_api(self):
        """
        Query the Ordnance Survey Places API for this client's full address.

        On a 200 response that contains at least one result, the results are stored, the first result's "DPA"
        record becomes ``most_relevant_result``, and the property type, built form, address and postcode are set
        from it. The outcome is remembered on the instance, so repeated calls do not hit the API again.

        :return: dictionary of the form {"status": <HTTP status code>}
        :raises ValueError: if no API key was supplied
        """

        if self._places_status is not None:
            return self._places_status

        if not self.api_key:
            raise ValueError("Ordnance Survey API key not specified")

        encoded_address_query = urllib.parse.quote(self.full_address)
        url = (f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&key="
               f"{self.api_key}")
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT_SECONDS)

        # A successful response is not guaranteed to carry any matches, so never index into it blindly
        results = response.json().get("results") if response.status_code == 200 else None

        if results:
            self.results = results

            # Extract some details about the best match
            self.most_relevant_result = self.results[0]["DPA"]

            self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"])
            self.set_places_address()
        else:
            logger.info("Could not find any results for the provided address and postcode")

        self._places_status = {"status": response.status_code}
        return self._places_status

    def parse_classification_code(self, classification_code: str):
        """
        This function will convert the classification code, returned by the OS places api, to a property type that is
        compatible with the EPC database.

        The various classifications can be found here:
        https://osdatahub.os.uk/docs/places/technicalSpecification

        Under LPI Output, CLASSIFICATION_CODE is described, and a link is provided to the full table of classifications
        For these purposes, we do not need the full classification as this includes non-residential properties. We only
        parse the ones of interest to us

        Codes that are not in the map (or carry no usable detail) leave property_type and built_form as "".
        :param classification_code: the CLASSIFICATION_CODE value of the matched record
        :return:
        """

        value_map = {
            # In the OS api, "RD" is a "Dwelling" however this is not valid property type in the EPC database
            'RD': {},
            # Spelled "Detached"/"Semi-Detached" so the values match the built forms compared elsewhere
            'RD02': {'property_type': 'House', 'built_form': 'Detached'},
            'RD03': {'property_type': 'House', 'built_form': 'Semi-Detached'},
            'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'},
            'RD06': {'property_type': 'Flat'},
        }

        mapped = value_map.get(classification_code, {})
        self.property_type = mapped.get("property_type", "")
        self.built_form = mapped.get("built_form", "")
def set_count_variables(self):
    """
    For EPC fields that are just counts, we'll set them here
    These are fields that are integers but may contain additional values such as "" so we can't do a direct
    conversion straight to an integer

    Each attribute is read from its own EPC field. Missing values (None, "" or a data anomaly marker) become
    None for number_of_storeys and number_of_rooms, and 0 for the other counts.
    :return:
    """

    fields = {
        "number_of_open_fireplaces": "number-open-fireplaces",
        "number_of_extensions": "extension-count",
        "number_of_storeys": "flat-storey-count",
        "number_of_rooms": "number-habitable-rooms",
    }

    # For these attributes a missing value is left as None rather than assumed to be zero
    null_attributes = ("number_of_storeys", "number_of_rooms")

    for attribute, epc_field in fields.items():
        # Read the field that belongs to this attribute (previously every attribute read "extension-count")
        value = self.data[epc_field]
        if value is None or value == "" or value in self.DATA_ANOMALY_MATCHES:
            value = None if attribute in null_attributes else 0
        else:
            # Go through float so that values such as "1.0" are accepted as counts
            value = int(float(value))

        setattr(self, attribute, value)
should not have a None value for the u value") - - if recommendation_record["roof_insulation_thickness_ending"] is None: - recommendation_record["roof_insulation_thickness_ending"] = "none" - - if recommendation["type"] == "mechanical_ventilation": - recommendation_record["mechanical_ventilation_ending"] = 'mechanical, extract only' - - if recommendation["type"] == "sealing_open_fireplace": - recommendation_record["number_open_fireplaces_ending"] = 0 - - if recommendation["type"] == "low_energy_lighting": - recommendation_record["low_energy_lighting_ending"] = 100 - recommendation_record["lighting_energy_eff_starting"] = "Very Good" - - if recommendation["type"] not in [ - "mechanical_ventilation", "sealing_open_fireplace", "low_energy_lighting", - "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation", - "loft_insulation", "room_roof_insulation", "flat_roof_insulation", - "solid_floor_insulation", "suspended_floor_insulation", "exposed_floor_insulation" - ]: - raise NotImplementedError("Implement me") - - return recommendation_record - - - def get_components(self, cleaned): + def get_components(self, cleaned, photo_supply_lookup, floor_area_decile_thresholds): """ Given the cleaning that has been performed, we'll use this to identify the property components, from roof to walls to windows, heating and hot water :param cleaned: This is the dictionary of components found in cleaner.cleaned + :param photo_supply_lookup: This is the lookup table for the photo supply, used to estimate the percentage + of the roof that is suitable for solar panels + :param floor_area_decile_thresholds: This is the decile thresholds for the floor area, used in estimating the + solar pv roof area :return: """ @@ -301,6 +274,10 @@ class Property(Definitions): self.set_wall_type() self.set_floor_type() self.set_floor_level() + self.set_windows_count() + self.set_solar_panel_area( + photo_supply_lookup=photo_supply_lookup, 
def set_solar_panel_area(self, photo_supply_lookup, floor_area_decile_thresholds):
    """
    Sets the approximate area of the solar panels

    The share of the roof used for panels is the mean of the "photo_supply_median" values of the matching
    photo supply rows, divided by 100. It is applied to the insulation floor area for a flat roof and to the
    pitched roof area otherwise. Both the area and the share are stored on the property.

    :param photo_supply_lookup: lookup table passed through to SolarPhotoSupply.filter_photo_supply_lookup
    :param floor_area_decile_thresholds: floor area decile thresholds passed through to the same filter
    :raises ValueError: if the roof area needed for this roof shape has not been set yet
    :return:
    """

    missing_area_message = (
        "Need to set insulation floor area and pitched roof area before setting solar pv roof area"
    )

    if (self.insulation_floor_area is None) and (self.pitched_roof_area is None):
        raise ValueError(missing_area_message)

    # Only one of the two areas is used, depending on the roof shape. Check that specific one, otherwise a
    # missing value would only surface later as a TypeError when multiplying None.
    roof_is_flat = self.roof["is_flat"]
    roof_area = self.insulation_floor_area if roof_is_flat else self.pitched_roof_area
    if roof_area is None:
        raise ValueError(missing_area_message)

    photo_supply_matched = SolarPhotoSupply.filter_photo_supply_lookup(
        photo_supply_lookup=photo_supply_lookup,
        floor_area_decile_thresholds=floor_area_decile_thresholds,
        tenure=self.data["tenure"],
        built_form=self.data["built-form"],
        property_type=self.data["property-type"],
        construction_age_band=self.construction_age_band,
        is_flat=roof_is_flat,
        is_pitched=self.roof["is_pitched"],
        is_roof_room=self.roof["is_roof_room"],
        floor_area=self.floor_area
    )

    # NOTE(review): if no lookup rows match, this mean is presumably NaN and both attributes below become
    # NaN - confirm whether an explicit fallback is wanted
    percentage_of_roof = photo_supply_matched["photo_supply_median"].mean()
    percentage_of_roof = percentage_of_roof / 100

    self.solar_pv_roof_area = roof_area * percentage_of_roof
    self.solar_pv_percentage = percentage_of_roof
'local-authority-label': 'str', + 'fixed-lighting-outlets-count': "Int64", + 'energy-tariff': 'str', + 'mechanical-ventilation': 'str', + 'hot-water-cost-current': 'str', + 'county': 'str', + 'postcode': 'str', + 'solar-water-heating-flag': 'str', + 'constituency': 'str', + 'co2-emissions-potential': 'float', + 'number-heated-rooms': 'float', + 'floor-description': 'str', + 'energy-consumption-potential': 'float', + 'local-authority': 'str', + 'built-form': 'str', + 'number-open-fireplaces': "Int64", + 'windows-description': 'str', + 'glazed-area': 'str', + # 'inspection-date': str, + 'mains-gas-flag': 'str', + 'co2-emiss-curr-per-floor-area': 'float', + 'address1': 'str', + 'heat-loss-corridor': 'str', + 'flat-storey-count': "Int64", + 'constituency-label': 'str', + 'roof-energy-eff': 'str', + 'total-floor-area': 'float', + 'building-reference-number': 'str', + 'environment-impact-current': 'float', + 'co2-emissions-current': 'float', + 'roof-description': 'str', + 'floor-energy-eff': 'str', + 'number-habitable-rooms': 'float', + 'address2': 'str', + 'hot-water-env-eff': 'str', + 'posttown': 'str', + 'mainheatc-energy-eff': 'str', + 'main-fuel': 'str', + 'lighting-env-eff': 'str', + 'windows-energy-eff': 'str', + 'floor-env-eff': 'str', + 'sheating-env-eff': 'str', + 'lighting-description': 'str', + 'roof-env-eff': 'str', + 'walls-energy-eff': 'str', + 'photo-supply': 'float', + 'lighting-cost-potential': 'float', + 'mainheat-env-eff': 'str', + 'multi-glaze-proportion': 'float', + 'main-heating-controls': 'str', + # 'lodgement-datetime', + 'flat-top-storey': 'str', + 'current-energy-rating': 'str', + 'secondheat-description': 'str', + 'walls-env-eff': 'str', + 'transaction-type': 'str', + # 'uprn': "Int64", + 'current-energy-efficiency': 'float', + 'energy-consumption-current': 'float', + 'mainheat-description': 'str', + 'lighting-cost-current': 'float', + # 'lodgement-date', + 'extension-count': "Int64", + 'mainheatc-env-eff': 'str', + 'lmk-key': 'str', + 
def __init__(
        self,
        address1: str,
        postcode: str,
        auth_token: str,
        os_api_key: str,
        full_address: str | None = None,
        max_retries: int | None = None,
        uprn: int | None = None,
        size: int | None = None,
        property_type: str | None = None,
):
    """
    Address line 1 and postcode are mandatory fields. The remaining search parameters are optional and can be
    used to find the epc for the home if address1 and postcode are insufficient
    :param address1: string, property's address line 1
    :param postcode: string, property's postcode
    :param auth_token: string, token passed to the EPC api client
    :param os_api_key: string, api key passed to the Ordnance Survey client
    :param full_address: string, optional parameter, the full address of the property
    :param max_retries: int, optional, number of retries to make when searching the api. Defaults to the class
        level MAX_RETRIES
    :param uprn: int, optional, the uprn of the property
    :param size: int, optional, the number of results to return. If not provided, defaults to 25 which is the api's
        default
    :param property_type: str, optional, the property type of the property, if known before hand
    """

    self.address1 = address1
    self.postcode = postcode
    self.full_address = full_address
    self.uprn = uprn
    # House number parsed from address line 1, plus its leading numeric part (None when there is none)
    self.house_number = self.get_house_number(self.address1)
    self.numeric_house_number = self.extract_numeric_housenumber_part(self.house_number)

    self.max_retries = max_retries if max_retries is not None else self.MAX_RETRIES

    self.client = EpcClient(auth_token=auth_token)
    self.ordnance_survey_client = OrdnanceSuveyClient(
        address=self.address1, postcode=self.postcode, api_key=os_api_key
    )

    # Search state; all of these start empty on a new instance
    self.data = None
    self.newest_epc = None
    self.older_epcs = None
    self.full_sap_epc = None

    # These are the address and postcode values, which we store in the database
    self.address_clean = None
    self.postcode_clean = None

    self.size = size if size is not None else 25

    self.property_type = property_type
not None) + else: + return None + + # Remove training commas + parsed_house_number = parsed_house_number.replace(",", "") + + return parsed_house_number + + @staticmethod + def extract_numeric_housenumber_part(house_number: str | None) -> int | None: + # Regular expression to find the first occurrence of one or more digits + + if house_number is None: + return None + + match = re.search(r'\d+', house_number) + + if match: + return int(match.group()) + else: + return None + + def get_epc(self, params=None, size=None): # Get the EPC data with retries + size = size if size is not None else self.size + if params is None: + if self.uprn: + params = {"uprn": self.uprn} + else: + params = {"address": self.address1, "postcode": self.postcode} for retry in range(self.max_retries): try: - response = self.client.domestic.search( - params={"address": self.address1, "postcode": self.postcode} - ) + + if "uprn" in params: + # We use the direct call method inside, since we need to implement uprn as a valid + # parameter for the search function + url = os.path.join(self.client.domestic.host, "search") + response = self.client.domestic.call(method="get", url=url, params=params) + else: + response = self.client.domestic.search(params=params, size=size) if response: self.data = response return self.SUCCESS if retry > 0: - print("Failed previous attempt but retry successful") + logger.info("Failed previous attempt but retry successful") # If we got nothing, final try if not response: - # TODO: Make a call to OS uprn service and get the address' uprn, just in case there is an - # issue with how we are searching the api - return { "status": 204, "message": "no data", @@ -127,7 +303,6 @@ class SearchEpc: if len(uprns) == 1: return rows - logger.error("Multiple UPRNS found - we should use an alternate method of searching - TODO") if property_type is not None: # We can do a filter on the property type rows_filtered = [r for r in rows if r["property-type"] == property_type] @@ -147,7 
+322,24 @@ class SearchEpc: return rows - def retrieve(self, property_type=None, address=None): + @staticmethod + def format_address(newest_epc): + """ + Format address and postcode for storage in the database + """ + postcode = newest_epc["postcode"] + address = newest_epc["address"] + + # Format them + address = address.replace(postcode, "").strip() + address = address.rstrip(",").strip() + address = address.title() + + postcode = postcode.upper() + + return address, postcode + + def extract_epc_data(self, address=None): """ Given a successful search, this method will format the data and return it @@ -163,7 +355,7 @@ class SearchEpc: # Firstly, we should only have 1 urpn so if we have multiple, we'll need to filter down the # property further - rows = self.filter_rows(rows, property_type=property_type, address=None) + rows = self.filter_rows(rows, property_type=self.property_type, address=None) rows = self.filter_rows(rows, property_type=None, address=address) # We now check for a full sap epc: @@ -173,7 +365,26 @@ class SearchEpc: # Finally, we identify the newest epc and the rest, and then return newest_epc, older_epcs = self.filter_newest_epc(list_of_epcs=rows) - return newest_epc, older_epcs, full_sap_epc + # Retrieve postcode and address + address_epc, postcode_epc = self.format_address(newest_epc=newest_epc) + + # Ge the uprn from the newest record for this home + uprns = {r["uprn"] for r in rows if r["uprn"]} + # We can sometimes have no uprn for a property + if (len(uprns) == 0) and len(rows) > 0: + logger.warning("Found data but missing uprn") + elif len(uprns) != 1: + # There is a possibility that we have multiple UPRNs for a single property, which is an error + addresses = {r["address"] for r in rows} + if len(addresses) == 1: + # Take the uprn from the most recent + uprns = {newest_epc["uprn"]} + else: + raise ValueError("Multiple UPRNs found - investigate me") + + uprn = uprns.pop() if uprns else None + + return newest_epc, older_epcs, full_sap_epc, 
address_epc, postcode_epc, uprn @staticmethod def filter_newest_epc(list_of_epcs: List): @@ -186,8 +397,334 @@ class SearchEpc: return {}, [] if len(newest_response) != 1: - raise Exception("More than one result found for this address - investigate me") + # It is possible (but rare, and likely an error on EPC lodgement) that we have multiple EPCs that + # were lodged at the exact same time. In this case, we will take the first one + newest_response = [newest_response[0]] older_epcs = [epc for epc in list_of_epcs if epc["lmk-key"] != newest_response[0]["lmk-key"]] return newest_response[0], older_epcs + + @staticmethod + def _get_epc_mode(col: str, epc_data: pd.DataFrame): + """ + Simple method to extract the mode value from the EPC data + :param col: name of the column to take the mode of + :param epc_data: pandas dataframe of epc data + """ + + mode_value = epc_data[[col]].mode(dropna=True) + if len(mode_value) != 1: + raise NotImplementedError("TODO: Handle multiple modes") + mode_value = mode_value.iloc[0][col] + + return mode_value + + def fetch_nearby_epcs( + self, initial_postcode: str, + lmks_to_drop: list[str] | None = None, + built_form: str = "", + property_type: str = "" + ): + """ + Fetches and processes EPC data for a given initial postcode, applying successive trimming + to the postcode and filtering the data until a non-empty result set is found. + + The function queries the EPC API with the provided postcode, and if no data is found or + if the data doesn't meet certain criteria, it progressively shortens the postcode by + removing the last character and retries the query. This process continues until a valid + set of EPC data is obtained or the postcode is exhausted. + + Additional filtering is applied to the obtained EPC data based on 'lmk-key', 'built-form', + and 'property-type'. The data is also processed to extract and numerically interpret house + numbers, calculate house number distances, and apply weights based on these distances. 
+ + :param initial_postcode: The initial full postcode for the EPC data query. + :param lmks_to_drop: List of 'lmk-key' values to be excluded from the EPC data. + :param built_form: The 'built-form' value to be used for filtering the EPC data. + :param property_type: The 'property-type' value to be used for filtering the EPC data. + :return: + """ + + property_type_api_map = { + "Bungalow": "bungalow", + "Flat": "flat", + "House": "house", + "Maisonette": "maisonette", + "Park home": "park home", + } + + postcode = initial_postcode + while postcode: + # Fetch data from EPC API + params = {"postcode": postcode} + if property_type: + params["property-type"] = property_type_api_map[property_type] + + # We take the 20 nearest homes of the relevant type, so not to pull in too many irrelevant homes + epc_response = self.get_epc(params=params, size=100) + + if epc_response["status"] == 200: + epc_data = pd.DataFrame(self.data["rows"]) + + if lmks_to_drop is not None: + epc_data = epc_data[~epc_data["lmk-key"].isin(lmks_to_drop)] + + if not epc_data.empty: + # Further processing of the EPC data + epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], format='mixed') + epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1) + epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1)) + epc_data["numeric_house_number"] = epc_data["house_number"].apply( + lambda house_num: self.extract_numeric_housenumber_part(house_num) + ) + + if self.numeric_house_number is None: + # If we don't have a house number, we treat all weights as equal + epc_data["weight"] = 1 + else: + epc_data["house_number_distance"] = abs( + epc_data["numeric_house_number"] - self.numeric_house_number + ) + # # We add 1, just in case we have a 0 weight (e.g. 
comparing house number 7a to 7b, or 9A to 9) + # epc_data["weight"] = 1 / (epc_data["house_number_distance"] + 1) + # # If we have a home without a house number, fill that weight with average + # epc_data["weight"] = epc_data["weight"].fillna(epc_data["weight"].mean()) + # # Finally, we might not have any house numbers whatsoever so everything could be + # # missing, so we fill with 1 + # epc_data["weight"] = epc_data["weight"].fillna(1) + # TODO: Testing + # If the postcode is different from the initial postcode, it doesn't make sense to have + # any weightings + if all(pd.isnull(epc_data["house_number_distance"])) or (postcode != initial_postcode): + epc_data["weight"] = 1 + else: + epc_data["weight"] = 1 / np.sqrt(epc_data["house_number_distance"] + 1) + epc_data["weight"] = epc_data["weight"].fillna(epc_data["weight"].mean()) + + estimation_property_type = self._estimate_str( + key="property-type", estimation_data=epc_data + ) if property_type == "" else property_type + + epc_built_form = self._estimate_str( + key="built-form", + estimation_data=epc_data[epc_data["property-type"] == estimation_property_type] + ) + + if built_form == "Semi-Detached" and epc_built_form in ["End-Terraced", "Mid-Terraced"]: + estimation_built_form = "End-Terraced" + elif (built_form == "") or (pd.isnull(built_form)): + estimation_built_form = epc_built_form + else: + estimation_built_form = built_form + + # We handle some edge cases experiences with maisonettes - if built form is detatched, just filter + # on maisonette + # We also add some additional logic for Park homes, because they are far less common than other + # property types + + is_maisonette_with_bad_built_form = (estimation_property_type == "Maisonette") & ( + estimation_built_form in ["Detached", "Semi-Detached"] + ) + + is_park_home_without_built_form = (estimation_property_type == "Park home") & ( + sum(epc_data["built-form"] == estimation_built_form) == 0 + ) + + has_missing_built_form = not estimation_built_form + 
+ if is_maisonette_with_bad_built_form or is_park_home_without_built_form or has_missing_built_form: + epc_data = epc_data[epc_data["property-type"] == estimation_property_type] + else: + epc_data = epc_data[ + (epc_data["built-form"] == estimation_built_form) & ( + epc_data["property-type"] == estimation_property_type) + ] + + if not epc_data.empty: + return epc_data # Return the filtered data if it's not empty + + # Shorten the postcode by one character for the next iteration + postcode = postcode[:-1].rstrip() + + # If loop finishes without a valid response, raise an exception + raise Exception("Unable to find postcode data after trimming - investigate me") + + def estimate_epc(self, property_type, built_form, lmks_to_drop=None): + """ + For a property that does not have an EPC, we retrieve the EPC data for the closest properties + and estimate the EPC for the property in question. + + Note - do we have postcodes with just a single address? We would need to use a different approach + to find the closest homes + :param property_type: This is the property type of the property we are estimating, that can be retrieved from + the ordnance survey api + :param built_form: This is the built form of the property we are estimating, that can be retrieved from + the ordnance survey api + :param lmks_to_drop: This is a list of LMK keys that should be dropped from the estimation process. This + is used as an override for testing, to drop EPCs for the property we are testing + :return: + """ + + # From the ordnance survey data, we want to determine the property type and then use only similar property + # types for the estimation process + epc_data = self.fetch_nearby_epcs( + initial_postcode=self.postcode, + lmks_to_drop=lmks_to_drop, + built_form=built_form, + property_type=property_type + ) + + # For each attribute, we need to determine the datatype and use an appropriate method + # to estimate. 
+ estimated_epc = {} + for key, vartype in vartypes.items(): + epc_data[key] = np.where(pd.isnull(epc_data[key]), None, epc_data[key]) + epc_data[key] = np.where(epc_data[key] == "", None, epc_data[key]) + estimation_data = epc_data[[key, "weight", "lodgement-datetime"]].copy() + estimation_data = estimation_data[~pd.isnull(estimation_data[key])] + estimation_data = estimation_data[~estimation_data[key].isin(Definitions.DATA_ANOMALY_MATCHES)] + if vartype == "Int64": + # We have some edge cases where we get the error "invalid literal for int() with base 10: '1.0'" + # so this handles this + estimation_data[key] = estimation_data[key].astype(float).astype(vartype) + else: + estimation_data[key] = estimation_data[key].astype(vartype) + + if estimation_data.shape[0] == 0: + estimated_epc[key] = None + continue + + if vartype == "Int64": + estimated_value = self._estimate_int(estimation_data, key) + elif vartype == "float": + estimated_value = self._estimate_float(estimation_data, key) + elif vartype == "str": + estimated_value = self._estimate_str(estimation_data, key) + else: + raise NotImplementedError("estimation method not implemented for type") + + estimated_epc[key] = estimated_value + + # Insert an estimated lodgement datetime, with a weighted average + estimated_epc["lodgement-datetime"] = self.calculate_weighted_lodgement_datetime(epc_data=epc_data) + # Extract logement date + estimated_epc["lodgement-date"] = estimated_epc["lodgement-datetime"].strftime("%Y-%m-%d") + + estimated_epc["postcode"] = self.postcode + estimated_epc["uprn"] = self.uprn + estimated_epc["address"] = self.full_address + # Indicate that this epc was estimated + estimated_epc["estimated"] = True + + return estimated_epc + + @staticmethod + def calculate_weighted_lodgement_datetime(epc_data): + numeric_dates = pd.to_datetime(epc_data['lodgement-datetime']).view('int64') + + # Calculate the weighted sum of dates + weighted_sum = (numeric_dates * epc_data['weight']).sum() + + # Calculate 
the sum of weights + total_weights = epc_data['weight'].sum() + + # Calculate the weighted mean in numeric format + weighted_mean_numeric = weighted_sum / total_weights + + # Convert the numeric weighted mean back to datetime + weighted_mean_datetime = pd.to_datetime(weighted_mean_numeric) + + return weighted_mean_datetime + + @staticmethod + def _estimate_int(estimation_data, key): + return round(np.average(a=estimation_data[key], weights=estimation_data["weight"])) + + @staticmethod + def _estimate_float(estimation_data, key): + return round(np.average(a=estimation_data[key], weights=estimation_data["weight"]), 2) + + @staticmethod + def _estimate_str(estimation_data, key): + agg = estimation_data.groupby(key)["weight"].sum().reset_index() + agg = agg[agg["weight"] == agg["weight"].max()] + if agg.shape[0] != 1: + # If we have multiple modes, we take the more recent data on average + recent_grouped = estimation_data[ + estimation_data[key].isin(agg[key].values) + ].groupby(key)["lodgement-datetime"].mean() + + newest_group = recent_grouped.idxmax() + return newest_group + + return agg[key].values[0] + + def find_property(self, skip_os=False): + """ + This method will attempt to identify a property. It will, at first, use the EPC api to try and + find the EPC for the property and the associated UPRN. If this fails, it will use the Ordnance Survey API to + find the UPRN of the address. + + Because no result may have been provided by the EPC api because of formatting issues with the address, + if the ordnance survey api is used and the uprn retrieved, the EPC api is queried again with the UPRN, just + as a final check to see if there is any EPC data. 
+ + If there is no EPC data, the epc data will be estimated based on the surrounding properties + """ + + # Step 1: use the epc api to find the property and uprn + response = self.get_epc() + + if response["status"] == 200: + ( + self.newest_epc, self.older_epcs, self.full_sap_epc, self.address_clean, self.postcode_clean, self.uprn + ) = self.extract_epc_data(address=self.full_address) + return + + # Step 2: If we don't have an EPC, we use the ordnance survey api to find the uprn + if skip_os: + if self.ordnance_survey_client.property_type is not None: + # We can try and estimate + estimated_epc = self.estimate_epc( + property_type=self.ordnance_survey_client.property_type, + built_form=self.ordnance_survey_client.built_form + ) + self.newest_epc = estimated_epc + self.older_epcs = [] + self.full_sap_epc = {} + + # Finally, set a standardised address 1 and postcode + self.address_clean = self.ordnance_survey_client.address_os + self.postcode_clean = self.ordnance_survey_client.postcode_os + return + + os_response = self.ordnance_survey_client.get_places_api() + + if os_response["status"] != 200: + # Investigate this if it happens + raise Exception("Unable to find property - investigate me") + + # Step 3: Now that we have a urpn, do another check against the epc api, this time searching with the uprn + self.uprn = self.ordnance_survey_client.most_relevant_result["UPRN"] + response = self.get_epc() + if response["status"] == 200: + ( + self.newest_epc, self.older_epcs, self.full_sap_epc, self.address_clean, self.postcode_clean, self.uprn + ) = self.extract_epc_data() + return + + # Step 4: If we still don't have an EPC, we estimate the EPC data + self.full_address = self.ordnance_survey_client.most_relevant_result["ADDRESS"] + estimated_epc = self.estimate_epc( + property_type=self.ordnance_survey_client.property_type, + built_form=self.ordnance_survey_client.built_form + ) + self.newest_epc = estimated_epc + self.older_epcs = [] + self.full_sap_epc = {} + + # 
Finally, set a standardised address 1 and postcode + self.address_clean = self.ordnance_survey_client.address_os + self.postcode_clean = self.ordnance_survey_client.postcode_os + return diff --git a/backend/app/config.py b/backend/app/config.py index 22621972..764bddf5 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -13,6 +13,7 @@ class Settings(BaseSettings): HEAT_PREDICTIONS_BUCKET: str PLAN_TRIGGER_BUCKET: str EPC_AUTH_TOKEN: str + ORDNANCE_SURVEY_API_KEY: str DB_HOST: str DB_PASSWORD: str DB_USERNAME: str diff --git a/backend/app/db/functions/property_functions.py b/backend/app/db/functions/property_functions.py index 93dc0c49..88b4e87d 100644 --- a/backend/app/db/functions/property_functions.py +++ b/backend/app/db/functions/property_functions.py @@ -11,7 +11,7 @@ from backend.app.db.models.portfolio import ( from sqlalchemy.orm.exc import NoResultFound -def create_property(session: Session, portfolio_id: int, address: str, postcode: str) -> (int, bool): +def create_property(session: Session, portfolio_id: int, address: str, postcode: str, uprn: str) -> (int, bool): """ This function will create a record for the property in the database if it does not exist. If it does exist, it will just update the updated_at field. 
@@ -25,7 +25,7 @@ def create_property(session: Session, portfolio_id: int, address: str, postcode: try: # Attempt to fetch the existing property existing_property = session.query(PropertyModel).filter_by( - address=address, postcode=postcode, portfolio_id=portfolio_id + uprn=uprn, portfolio_id=portfolio_id ).one() # Update the 'updated_at' field @@ -43,6 +43,7 @@ def create_property(session: Session, portfolio_id: int, address: str, postcode: address=address, postcode=postcode, portfolio_id=portfolio_id, + uprn=uprn, creation_status=PropertyCreationStatus.LOADING, status=PortfolioStatus.ASSESSMENT.value, has_pre_condition_report=False, diff --git a/backend/app/db/models/materials.py b/backend/app/db/models/materials.py index 2ac7ddf4..97085d7a 100644 --- a/backend/app/db/models/materials.py +++ b/backend/app/db/models/materials.py @@ -19,7 +19,6 @@ class MaterialType(enum.Enum): flat_roof_insulation = "flat_roof_insulation" room_roof_insulation = "room_roof_insulation" windows_glazing = "windows_glazing" - iwi_wall_demolition = "iwi_wall_demolition" iwi_vapour_barrier = "iwi_vapour_barrier" diff --git a/backend/app/db/models/portfolio.py b/backend/app/db/models/portfolio.py index 6f865381..f7c0370b 100644 --- a/backend/app/db/models/portfolio.py +++ b/backend/app/db/models/portfolio.py @@ -153,6 +153,7 @@ class PropertyDetailsEpcModel(Base): primary_energy_consumption = Column(Float) co2_emissions = Column(Float) adjusted_energy_consumption = Column(Float) + estimated = Column(Boolean, default=False) class PropertyDetailsSpatial(Base): diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 1704a42f..d35ea98b 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd from epc_api.client import EpcClient from etl.epc.Record import EPCRecord +from backend.SearchEpc import SearchEpc from fastapi import APIRouter, Depends from sqlalchemy.exc import IntegrityError, OperationalError 
from sqlalchemy.orm import sessionmaker @@ -30,6 +31,8 @@ from backend.ml_models.api import ModelApi from backend.Property import Property from etl.epc.DataProcessor import EPCDataProcessor from etl.epc.settings import COLUMNS_TO_MERGE_ON +from etl.solar.SolarPhotoSupply import SolarPhotoSupply + from recommendations.optimiser.CostOptimiser import CostOptimiser from recommendations.optimiser.GainOptimiser import GainOptimiser from recommendations.optimiser.optimiser_functions import prepare_input_measures @@ -43,54 +46,6 @@ logger = setup_logger() BATCH_SIZE = 5 -class DummyDownloader: - - def __init__(self, postcode, address1, id, epc_client): - self.id = id - self.postcode = postcode - self.address1 = address1 - - self.data = None - self.old_data = None - - self.epc_client = epc_client - - def search_address_epc(self): - """ - This method searches for an address in the EPC database and returns the first result - :return: property data - """ - if self.data: - return - - # This will fail if a property does not have an EPC - this has been documented as a case to handle - response = self.epc_client.domestic.search(params={"address": self.address1, "postcode": self.postcode}) - - # Check if we have a full sap EPC - self.full_sap_epc = [r for r in response["rows"] if r["transaction-type"] == "new dwelling"] - self.full_sap_epc = self.full_sap_epc[0] if self.full_sap_epc else self.full_sap_epc - - if len(response["rows"]) > 1: - newest_response = [ - r for r in response["rows"] if - r["lodgement-datetime"] == max([x["lodgement-datetime"] for x in response["rows"]]) - ] - if len(newest_response) > 1: - raise Exception("More than one result found for this address - investigate me") - - # We'll keep old EPCs in case it contains information, not present on the newest one - self.old_data = [epc for epc in response["rows"] if epc["lmk-key"] != newest_response[0]["lmk-key"]] - - response["rows"] = newest_response - - self.data = response["rows"][0] - # For the moment, if we 
don't have a UPRN, we don't do anything about it, however we'll handle this in - # the future by using the Ordnance Survey places API - if not self.data["uprn"]: - logger.warning("We do not have a UPRN for this property") - else: - self.uprn = int(self.data["uprn"]) - router = APIRouter( prefix="/plan", tags=["plan"], @@ -103,37 +58,34 @@ router = APIRouter( @router.post("/trigger") async def trigger_plan(body: PlanTriggerRequest): logger.info("Connecting to db") - # session = sessionmaker(bind=db_engine)() + session = sessionmaker(bind=db_engine)() created_at = datetime.now().isoformat() try: session.begin() logger.info("Getting the inputs") - Body = {'portfolio_id': '56', 'housing_type': 'Social', 'goal': 'Increase EPC', 'goal_value': 'A', 'trigger_file_path': '8/56/windows_portfolio_inputs.csv'} - body = PlanTriggerRequest(**Body) epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN) plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path) - uprn_filenames = read_dataframe_from_s3_parquet( - bucket_name=get_settings().DATA_BUCKET, file_key="spatial/filename_meta.parquet" - ) - cleaning_data = read_parquet_from_s3( - bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet", - ) input_properties = [] for config in plan_input: # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly - # TODO: implment validation. 
We should also standardise postcode and address in some fashion as - # a postcode of abcdef would be considered different to ABCDEF + + epc_searcher = SearchEpc( + address1=config["address"], + postcode=config["postcode"], + auth_token=get_settings().EPC_AUTH_TOKEN, + os_api_key=get_settings().ORDNANCE_SURVEY_API_KEY + ) + epc_searcher.find_property() # Create a record in db property_id, is_new = create_property( - session, portfolio_id=body.portfolio_id, address=config['address'], postcode=config['postcode'] + session, body.portfolio_id, epc_searcher.address_clean, epc_searcher.postcode_clean, epc_searcher.uprn ) # if a new record was not created, we don't produduce recommendations if not is_new: continue - # TODO: Need to add heat demand target create_property_targets( session, @@ -143,29 +95,21 @@ async def trigger_plan(body: PlanTriggerRequest): heat_demand_target=None ) - epc_downloader = DummyDownloader(id=0, epc_client=epc_client, postcode=config['postcode'], address1=config['address']) - epc_downloader.search_address_epc() - epc_records ={ - 'original_epc': epc_downloader.data.copy(), - 'full_sap_epc': epc_downloader.full_sap_epc.copy() if epc_downloader.full_sap_epc else [], - 'old_data': epc_downloader.old_data.copy() if epc_downloader.old_data else [] + 'original_epc': epc_searcher.newest_epc, + 'full_sap_epc': epc_searcher.full_sap_epc, + 'old_data': epc_searcher.old_data, } prepared_epc = EPCRecord(epc_records=epc_records, run_mode="newdata", cleaning_data=cleaning_data) # This uses all the epc records to clean the data - - p = Property( + + input_properties.append( + Property( id=property_id, address1=config['address'], postcode=config['postcode'], epc_record=prepared_epc, ) - - logger.info("Getting spatial data") - - p.get_spatial_data(uprn_filenames) - input_properties.append( - p ) @@ -180,10 +124,19 @@ async def trigger_plan(body: PlanTriggerRequest): materials = get_materials(session) cleaned = get_cleaned() - logger.info("Getting components and 
epc recommendations") + uprn_filenames = read_dataframe_from_s3_parquet( + bucket_name=get_settings().DATA_BUCKET, file_key="spatial/filename_meta.parquet" + ) + cleaning_data = read_dataframe_from_s3_parquet( + bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet", + ) + photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket=get_settings().DATA_BUCKET) - # TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers - # in as a dependency and then the optimisers can take the input measures in as part of the setup() method + logger.info("Getting spatial data") + for p in input_properties: + p.get_spatial_data(uprn_filenames) + + logger.info("Getting components and epc recommendations") recommendations = {} recommendations_scoring_data = [] @@ -192,7 +145,7 @@ async def trigger_plan(body: PlanTriggerRequest): for p in input_properties: # Property recommendations - p.get_components(cleaned) + p.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds) recommender = Recommendations(property_instance=p, materials=materials) property_recommendations = recommender.recommend() diff --git a/backend/app/plan/utils.py b/backend/app/plan/utils.py index 7aba99c9..7672c316 100644 --- a/backend/app/plan/utils.py +++ b/backend/app/plan/utils.py @@ -175,11 +175,34 @@ def create_recommendation_scoring_data( scoring_dict["LOW_ENERGY_LIGHTING_ENDING"] = 100 scoring_dict["LIGHTING_ENERGY_EFF_STARTING"] = "Very Good" + if recommendation["type"] == "windows_glazing": + scoring_dict["MULTI_GLAZE_PROPORTION_ENDING"] = 100 + scoring_dict["WINDOWS_ENERGY_EFF_ENDING"] = "Average" + + is_secondary_glazing = recommendation["is_secondary_glazing"] + + if scoring_dict["glazing_type_ENDING"] == "multiple": + pass + elif scoring_dict["glazing_type_ENDING"] == "single": + scoring_dict["glazing_type_ENDING"] = "secondary" if is_secondary_glazing else "double" + elif 
scoring_dict["glazing_type_ENDING"] == "double": + scoring_dict["glazing_type_ENDING"] = "multiple" if is_secondary_glazing else "double" + elif scoring_dict["glazing_type_ENDING"] == "secondary": + scoring_dict["glazing_type_ENDING"] = "secondary" if is_secondary_glazing else "multiple" + elif scoring_dict["glazing_type_ENDING"] in ["triple", "high performance"]: + scoring_dict["glazing_type_ENDING"] = "multiple" + else: + raise ValueError("Invalid glazing type - implement me") + + if recommendation["type"] == "solar_pv": + scoring_dict["PHOTO_SUPPLY_ENDING"] = recommendation["photo_supply"] + if recommendation["type"] not in [ "mechanical_ventilation", "sealing_open_fireplace", "low_energy_lighting", "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation", "loft_insulation", "room_roof_insulation", "flat_roof_insulation", - "solid_floor_insulation", "suspended_floor_insulation", "exposed_floor_insulation" + "solid_floor_insulation", "suspended_floor_insulation", "exposed_floor_insulation", + "windows_glazing", "solar_pv" ]: raise NotImplementedError("Implement me") diff --git a/backend/app/utils.py b/backend/app/utils.py index d912a94a..9a03ab21 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -121,19 +121,6 @@ def epc_to_sap_lower_bound(epc: str): raise ValueError("EPC rating should be between A and G") -def read_parquet_from_s3(bucket_name, file_key): - client = boto3.client('s3') - - # Get the object - s3_object = client.get_object(Bucket=bucket_name, Key=file_key) - - # Read the CSV body into a DataFrame - csv_body = s3_object["Body"].read() - df = pd.read_parquet(BytesIO(csv_body)) - - return df - - def save_dataframe_to_s3_parquet(df, bucket_name, file_key): """ Save a pandas DataFrame to S3 as a Parquet file. 
diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index f5a7e2bb..018b4678 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -19,7 +19,9 @@ class PropertyValuation: 100070505235: 344000, # Based on Zoopla's estimation of 131 School road, which is also semi-detached 100070513306: 182000, # Based on Zoopla's estimation of 61 Simmons Drive 100071306896: 77000, # Based on Flat 2 of 44 Wedgewood Road on Zoopla - 100021192109: 650000 # Based on Zoopla + 100021192109: 650000, # Based on Zoopla + 766249482: 358000, # Based on Zoopla estimate for 19 Spring Lane, 3 bedroom semi-detached + 100120703802: 277000, # Based on Zoopla } # We base our valuation uplifts on a number of sources @@ -93,7 +95,13 @@ class PropertyValuation: value = cls.UPRN_VALUE_LOOKUP.get(property_instance.uprn) if not value: - raise ValueError("Have not implemented valuation for this property") + return { + "current_value": None, + "lower_bound_increased_value": None, + "upper_bound_increased_value": None, + "average_increased_value": None, + "average_increase": None + } current_epc = property_instance.data["current-energy-rating"] # We get the spectrum of ratings between the current and target EPC @@ -119,4 +127,5 @@ class PropertyValuation: "lower_bound_increased_value": value * (1 + min_increase), "upper_bound_increased_value": value * (1 + max_increase), "average_increased_value": value * (1 + avg_increase), + "average_increase": value * (1 + avg_increase) - value } diff --git a/backend/ml_models/api.py b/backend/ml_models/api.py index e6947906..bc09f26c 100644 --- a/backend/ml_models/api.py +++ b/backend/ml_models/api.py @@ -2,8 +2,7 @@ import pandas as pd import requests from requests.exceptions import RequestException from utils.logger import setup_logger -from utils.s3 import save_dataframe_to_s3_parquet -from backend.app.utils import read_parquet_from_s3 +from utils.s3 import save_dataframe_to_s3_parquet, 
read_dataframe_from_s3_parquet logger = setup_logger() @@ -125,7 +124,7 @@ class ModelApi: # Retrieve the predictions predictions_df = pd.DataFrame( - read_parquet_from_s3( + read_dataframe_from_s3_parquet( bucket_name=predictions_bucket, file_key=response["storage_filepath"].split(predictions_bucket + "/")[1] ) diff --git a/backend/requirements/base.txt b/backend/requirements/base.txt index 7a925030..3173f7f8 100644 --- a/backend/requirements/base.txt +++ b/backend/requirements/base.txt @@ -35,4 +35,5 @@ mip==1.15.0 boto3==1.28.3 pandas==1.5.3 pyarrow==12.0.1 -textblob \ No newline at end of file +textblob +usaddress==0.5.10 \ No newline at end of file diff --git a/backend/tests/test_property.py b/backend/tests/test_property.py index 871c9291..09594a40 100644 --- a/backend/tests/test_property.py +++ b/backend/tests/test_property.py @@ -9,6 +9,7 @@ from etl.epc_clean.EpcClean import EpcClean mock_epc_response = { "rows": [ { + "tenure": "rental (social)", "lmk-key": 1, "uprn": 1, "number-habitable-rooms": 5, @@ -17,7 +18,7 @@ mock_epc_response = { "inspection-date": "2023-06-01", 'lodgement-datetime': '2023-06-01 20:29:01', "some-other-key": "some-value", - "roof-description": "Roof Description", + "roof-description": "pitched, no insulation", "walls-description": "Walls Description", "windows-description": "Windows Description", "mainheat-description": "Main Heating Description", @@ -37,7 +38,8 @@ mock_epc_response = { "floor-height": 2.5, "total-floor-area": 100, "construction-age-band": "England and Wales: 1967-1975", - "floor-description": "Floor Description" + "floor-description": "Floor Description", + "floor-level": "Ground" }, { "lmk-key": 2, @@ -68,7 +70,8 @@ mock_epc_response = { "floor-height": 2.5, "total-floor-area": 100, "construction-age-band": "England and Wales: 1967-1975", - "floor-description": "Floor Description" + "floor-description": "Floor Description", + "floor-level": "Ground" } ] } @@ -100,7 +103,8 @@ mock_epc_response_dupe = { 
"floor-height": 2.5, "total-floor-area": 100, "construction-age-band": "England and Wales: 1967-1975", - "floor-description": "Floor Description" + "floor-description": "Floor Description", + "floor-level": "Ground" }, { "lmk-key": 2, @@ -128,7 +132,8 @@ mock_epc_response_dupe = { "floor-height": 2.5, "total-floor-area": 100, "construction-age-band": "England and Wales: 1967-1975", - "floor-description": "Floor Description" + "floor-description": "Floor Description", + "floor-level": "Ground" }, { "lmk-key": 3, @@ -156,36 +161,62 @@ mock_epc_response_dupe = { "floor-height": 2.5, "total-floor-area": 100, "construction-age-band": "England and Wales: 1967-1975", - "floor-description": "Floor Description" + "floor-description": "Floor Description", + "floor-level": "Ground" } ] } class TestProperty: + @pytest.fixture(autouse=True) - def property_instance(self, mock_epc_client, mock_cleaner): - property_instance = Property(1, "AB12CD", "Test Address", epc_client=mock_epc_client) + def mock_photo_supply_lookup(self): + return pd.DataFrame( + [ + dict( + tenure="rental (social)", + built_form="Detached", + property_type="House", + construction_age_band="England and Wales: 1967-1975", + is_flat=False, + is_pitched=True, + is_roof_room=False, + floor_area_decile=2, + photo_supply_median=40 + ) + ] + ) + + @pytest.fixture(autouse=True) + def mock_floor_area_decile_thresholds(self): + return pd.DataFrame( + {"floor_area_decile_thresholds": [0, 10, 30, 50]} + ) + + @pytest.fixture(autouse=True) + def property_instance(self, mock_cleaner): + property_instance = Property(id=1, postcode="AB12CD", address="Test Address", data=mock_epc_response["rows"][0]) return property_instance @pytest.fixture(autouse=True) - def property_instance_dupe_data(self, mock_epc_client_dupe_data): - property_instance_dupe_data = Property(2, "AB12CD", "Test Address", epc_client=mock_epc_client_dupe_data) + def property_instance_dupe_data(self): + property_instance_dupe_data = Property(id=2, 
postcode="AB12CD", address="Test Address") return property_instance_dupe_data - @pytest.fixture - def mock_epc_client(self): - mock_epc_client = Mock(spec=EpcClient(auth_token="mocked_auth_token")) - mock_epc_client.domestic.search.return_value = mock_epc_response.copy() - mock_epc_client.auth_token = "mocked_auth_token" - return mock_epc_client - - @pytest.fixture - def mock_epc_client_dupe_data(self): - mock_epc_client_dupe_data = Mock(spec=EpcClient(auth_token="mocked_auth_token")) - mock_epc_client_dupe_data.domestic.search.return_value = mock_epc_response_dupe.copy() - mock_epc_client_dupe_data.auth_token = "mocked_auth_token" - return mock_epc_client_dupe_data + # @pytest.fixture + # def mock_epc_client(self): + # mock_epc_client = Mock(spec=EpcClient(auth_token="mocked_auth_token")) + # mock_epc_client.domestic.search.return_value = mock_epc_response.copy() + # mock_epc_client.auth_token = "mocked_auth_token" + # return mock_epc_client + # + # @pytest.fixture + # def mock_epc_client_dupe_data(self): + # mock_epc_client_dupe_data = Mock(spec=EpcClient(auth_token="mocked_auth_token")) + # mock_epc_client_dupe_data.domestic.search.return_value = mock_epc_response_dupe.copy() + # mock_epc_client_dupe_data.auth_token = "mocked_auth_token" + # return mock_epc_client_dupe_data @pytest.fixture def mock_cleaner(self): @@ -224,7 +255,11 @@ class TestProperty: } mock_cleaner.cleaned = { - "roof-description": [{"original_description": "Roof Description"}], + "roof-description": [ + {"original_description": "Roof Description"}, + {"original_description": "pitched, no insulation", "is_pitched": True, "is_flat": False, + "is_roof_room": False} + ], "walls-description": [walls_data], "windows-description": [{"original_description": "Windows Description"}], "mainheat-description": [{"original_description": "Main Heating Description"}], @@ -235,37 +270,32 @@ class TestProperty: } return mock_cleaner - def test_init(self, mock_epc_client): - inst1 = Property(0, "AB12CD", "Test 
Address", epc_client=mock_epc_client) - # Should be mocked auth token - assert inst1.epc_client.auth_token == "mocked_auth_token" + def test_init(self): + inst1 = Property(0, postcode="AB12CD", address="Test Address") - inst2 = Property(3, "AB12CD", "Test Address", epc_client=mock_epc_client) - assert inst2.epc_client.auth_token + assert inst1.data is None - inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"}, epc_client=mock_epc_client) - assert inst3.data == {"some": "data"} + inst2 = Property(3, "AB12CD", "Test Address") + assert inst2.id == 3 - data = inst3.search_address_epc() - assert data is None + inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data", "uprn": 123}) + assert inst3.data == {"some": "data", "uprn": 123} - def test_search_address_epc(self, property_instance): - # Call the method to test - property_instance.search_address_epc() - - # Verify that the correct data is being returned - assert property_instance.data == mock_epc_response["rows"][0] - - def test_search_address_epc_multiple_results(self, property_instance_dupe_data, mock_epc_client_dupe_data): - with pytest.raises(Exception, match="More than one result found for this address - investigate me"): - property_instance_dupe_data.search_address_epc() - - def test_get_components(self, property_instance, mock_cleaner, mock_epc_client): - property_instance.search_address_epc() - property_instance.get_components(mock_cleaner.cleaned) + def test_get_components( + self, property_instance, mock_cleaner, mock_photo_supply_lookup, mock_floor_area_decile_thresholds + ): + property_instance.get_components( + mock_cleaner.cleaned, + photo_supply_lookup=mock_photo_supply_lookup, + floor_area_decile_thresholds=mock_floor_area_decile_thresholds + ) # Verify that the components are set correctly - assert property_instance.roof == {"original_description": "Roof Description"} + assert property_instance.roof == { + 'original_description': 'pitched, no insulation', 'is_pitched': 
True, + 'is_flat': False, 'is_roof_room': False + } + assert property_instance.walls == { "original_description": "Walls Description", "is_cavity_wall": True, @@ -289,24 +319,15 @@ class TestProperty: # Verify that ValueError is raised when EpcClean doesn't contain cleaned data with pytest.raises(ValueError, match="Cleaner does not contain cleaned data"): - property_instance.get_components(mock_cleaner.cleaned) + property_instance.get_components(mock_cleaner.cleaned, pd.DataFrame(), pd.DataFrame()) - def test_get_components_no_data(self, property_instance, mock_cleaner): + def test_get_components_no_attributes( + self, property_instance, mock_cleaner, mock_photo_supply_lookup, mock_floor_area_decile_thresholds + ): # Modify the mock cleaner to have no attributes for a specific description mock_cleaner.cleaned = { "roof-description": [] } - - # Verify that ValueError is raised when no attributes are found - with pytest.raises(ValueError, match="Property does not contain data"): - property_instance.get_components(mock_cleaner.cleaned) - - def test_get_components_no_attributes(self, property_instance, mock_cleaner): - # Modify the mock cleaner to have no attributes for a specific description - mock_cleaner.cleaned = { - "roof-description": [] - } - property_instance.search_address_epc() property_instance.data["roof-description"] = "Pitched, no insulation" property_instance.walls = { "original_description": "Walls Description", @@ -327,14 +348,17 @@ class TestProperty: } # Assert backup cleaning has been applied - property_instance.get_components(mock_cleaner.cleaned) + property_instance.get_components( + mock_cleaner.cleaned, mock_photo_supply_lookup, mock_floor_area_decile_thresholds + ) assert property_instance.roof["clean_description"] == "Pitched, no insulation" assert property_instance.roof["is_pitched"] - def test_get_components_multiple_attributes(self, property_instance, mock_cleaner): + def test_get_components_multiple_attributes( + self, property_instance, 
mock_cleaner, mock_photo_supply_lookup, mock_floor_area_decile_thresholds + ): # This shouldn't happen - it would mean a cleaning error - property_instance.search_address_epc() property_instance.data["roof-description"] = "Roof Description" cleaned = { "roof-description": [ @@ -345,10 +369,10 @@ class TestProperty: # Verify that ValueError is raised when multiple attributes are found with pytest.raises(ValueError, match="Either No attributes or multiple found for roof-description"): - property_instance.get_components(cleaned) + property_instance.get_components(cleaned, mock_photo_supply_lookup, mock_floor_area_decile_thresholds) - def test_set_spatial(self, mock_epc_client): - prop = Property(1, "AB12CD", "Test Address", mock_epc_client) + def test_set_spatial(self): + prop = Property(1, postcode="AB12CD", address="Test Address") spatial1 = pd.DataFrame([{ 'X_COORDINATE': 411143.0, 'Y_COORDINATE': 281701.0, 'LATITUDE': 52.4331896, 'LONGITUDE': -1.8375238, @@ -362,7 +386,7 @@ class TestProperty: assert prop.is_heritage assert prop.restricted_measures - prop2 = Property(1, "AB12CD", "Test Address", mock_epc_client) + prop2 = Property(1, "AB12CD", "Test Address") spatial2 = pd.DataFrame([{ 'X_COORDINATE': 411143.0, 'Y_COORDINATE': 281701.0, 'LATITUDE': 52.4331896, 'LONGITUDE': -1.8375238, @@ -376,10 +400,10 @@ class TestProperty: assert not prop2.is_heritage assert not prop2.restricted_measures - def test_set_floor_level(self, mock_epc_client): + def test_set_floor_level(self): # In this case, we have a flat which looks looks it's on the first floor, but it's actually on the ground # floor, so we should set floor_level to 0 - prop = Property(1, "AB12CD", "Test Address", mock_epc_client) + prop = Property(1, postcode="AB12CD", address="Test Address") prop.data = {'floor-level': '01', 'property-type': 'Flat'} prop.floor = { 'original_description': 'Solid, no insulation (assumed)', 'clean_description': 'Solid, no insulation', @@ -395,7 +419,7 @@ class TestProperty: # 
This property is labelled as being on the ground floor but actually has another property below # so we set floor level to 1 - prop2 = Property(1, "AB12CD", "Test Address", mock_epc_client) + prop2 = Property(1, postcode="AB12CD", address="Test Address") prop2.data = {'floor-level': 'Ground', 'property-type': 'Flat'} prop2.floor = { 'original_description': '(Another dwelling below)', 'clean_description': 'Solid, no insulation', @@ -410,7 +434,7 @@ class TestProperty: assert prop2.floor_level == 1 # this property is correctly labelled as being on the 2nd floor - prop3 = Property(1, "AB12CD", "Test Address", mock_epc_client) + prop3 = Property(1, postcode="AB12CD", address="Test Address") prop3.data = {'floor-level': '02', 'property-type': 'Flat'} prop3.floor = { 'original_description': '(Another dwelling below)', 'clean_description': 'Solid, no insulation', @@ -425,7 +449,7 @@ class TestProperty: assert prop3.floor_level == 2 # Example of a house - prop4 = Property(1, "AB12CD", "Test Address", mock_epc_client) + prop4 = Property(1, postcode="AB12CD", address="Test Address") prop4.data = {'floor-level': '', 'property-type': 'House'} prop4.floor = { 'original_description': '(Another dwelling below)', 'clean_description': 'Solid, no insulation', diff --git a/backend/tests/test_sap_model_prep.py b/backend/tests/test_sap_model_prep.py index f20e4993..89c436ce 100644 --- a/backend/tests/test_sap_model_prep.py +++ b/backend/tests/test_sap_model_prep.py @@ -2,13 +2,11 @@ from backend.Property import Property from etl.epc.DataProcessor import DataProcessor from backend.app.plan.utils import create_recommendation_scoring_data, get_cleaned from etl.epc.settings import COLUMNS_TO_MERGE_ON -from epc_api.client import EpcClient import pandas as pd import pytest import msgpack from utils.s3 import read_dataframe_from_s3_parquet, read_from_s3 -from tqdm import tqdm # Handy code for selecting testing data @@ -122,7 +120,21 @@ class TestSapModelPrep: cleaned = msgpack.unpackb(cleaned, 
raw=False) return cleaned - def test_fill_cavity_wall(self, cleaned, cleaning_data): + @pytest.fixture + def photo_supply_lookup(self): + photo_supply_lookup = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key="solar_pv_supply/photo_supply_lookup.parquet", + ) + return photo_supply_lookup + + @pytest.fixture + def floor_area_decile_thresholds(self): + floor_area_decile_thresholds = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key="solar_pv_supply/floor_area_decile_thresholds.parquet", + ) + return floor_area_decile_thresholds + + def test_fill_cavity_wall(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds): """ We ensure that the process that prepares the data in the engine code results in the same data as the model is trained on @@ -288,11 +300,10 @@ class TestSapModelPrep: home = Property( id=0, postcode=starting_epc["postcode"], - address1=starting_epc["address1"], - epc_client=EpcClient(auth_token="notoken"), + address=starting_epc["address1"], data=starting_epc ) - home.get_components(cleaned) + home.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds) data_processor = DataProcessor(None, newdata=True) data_processor.insert_data(pd.DataFrame([home.get_model_data()])) @@ -356,7 +367,7 @@ class TestSapModelPrep: assert test_record[c].values[0] == row[c] - def test_internal_wall_insulation(self, cleaned, cleaning_data): + def test_internal_wall_insulation(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds): starting_epc2 = { 'low-energy-fixed-light-count': '2', 'address': 'FLAT 12, WAREHOUSE W, 3 WESTERN GATEWAY', @@ -508,11 +519,10 @@ class TestSapModelPrep: home2 = Property( id=0, postcode=starting_epc2["postcode"], - address1=starting_epc2["address1"], - epc_client=EpcClient(auth_token="notoken"), + address=starting_epc2["address1"], data=starting_epc2 ) - home2.get_components(cleaned) + home2.get_components(cleaned, 
photo_supply_lookup, floor_area_decile_thresholds) home2.set_number_lighting_outlets(None) data_processor2 = DataProcessor(None, newdata=True) @@ -578,7 +588,7 @@ class TestSapModelPrep: assert test_record2[c].values[0] == row2[c] - def test_ventilation(self, cleaned, cleaning_data): + def test_ventilation(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds): starting_epc3 = { 'low-energy-fixed-light-count': '', 'address': '45 Shepperson Road', 'uprn-source': 'Energy Assessor', @@ -728,11 +738,10 @@ class TestSapModelPrep: home3 = Property( id=0, postcode=starting_epc3["postcode"], - address1=starting_epc3["address1"], - epc_client=EpcClient(auth_token="notoken"), + address=starting_epc3["address1"], data=starting_epc3 ) - home3.get_components(cleaned) + home3.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds) home3.set_number_lighting_outlets(None) data_processor3 = DataProcessor(None, newdata=True) @@ -782,7 +791,7 @@ class TestSapModelPrep: assert test_record3[c].values[0] == row3[c] - def test_fireplaces(self, cleaned, cleaning_data): + def test_fireplaces(self, cleaned, cleaning_data, photo_supply_lookup, floor_area_decile_thresholds): starting_epc4 = { 'low-energy-fixed-light-count': '', 'address': '9 Glebe Road, Asfordby Hill', @@ -937,11 +946,10 @@ class TestSapModelPrep: home4 = Property( id=0, postcode=starting_epc4["postcode"], - address1=starting_epc4["address1"], - epc_client=EpcClient(auth_token="notoken"), + address=starting_epc4["address1"], data=starting_epc4 ) - home4.get_components(cleaned) + home4.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds) home4.set_number_lighting_outlets(None) data_processor4 = DataProcessor(None, newdata=True) diff --git a/etl/costs/app.py b/etl/costs/app.py index 4d53ce28..30eff735 100644 --- a/etl/costs/app.py +++ b/etl/costs/app.py @@ -75,6 +75,7 @@ def app(): ewi_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="external_wall_insulation", 
header=0) lel_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="low_energy_lighting", header=0) flat_roof_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="flat_roof_insulation", header=0) + window_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="window_glazing", header=0) # Form a single table to be uploaded costs = pd.concat( diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index f25d06bd..13966655 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -33,6 +33,7 @@ class Eligibility: # If the loft has less than 100mm of insulation, we classify the home has needing loft insulation LOFT_INSULATION_THRESHOLD = 100 + HIGH_LOFT_INSULATION_THRESHOLD = 269 # Because EPCS have different values for tenure, we need to remap them to a common set of values tenure_remap = { @@ -104,6 +105,8 @@ class Eligibility: self.LOFT_INSULATION_THRESHOLD if loft_thickness_threshold is None else loft_thickness_threshold ) + high_loft_thickness_threshold = self.HIGH_LOFT_INSULATION_THRESHOLD + # We firstly check if the roof is a loft is_loft = self.roof["is_pitched"] and (not self.roof["is_roof_room"]) @@ -122,7 +125,22 @@ class Eligibility: is_flat=self.roof["is_flat"] ) - if insulation_thickness > loft_thickness_threshold: + if insulation_thickness <= loft_thickness_threshold: + self.loft = { + "suitability": True, + "thickness": insulation_thickness, + "reason": None + } + + if insulation_thickness <= high_loft_thickness_threshold: + self.loft = { + "suitability": True, + "thickness": insulation_thickness, + "reason": "high loft thickness but below regulation" + } + return + + if insulation_thickness > high_loft_thickness_threshold: # Insulation is already thick enough self.loft = { "suitability": False, @@ -131,12 +149,6 @@ class Eligibility: } return - self.loft = { - "suitability": True, - "thickness": insulation_thickness, - "reason": None - } - def cavity_insulation(self): """ @@ -152,9 +164,25 @@ class Eligibility: 
is_partial_filled = ( self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["below average"] ) + # We look for potentially under performing cavities - anything that is assumed, as built and insulated + is_underperforming = ( + self.walls["is_as_built"] and self.walls["insulation_thickness"] in ["average"] and self.walls["is_assumed"] + ) is_unfilled_cavity = is_cavity and is_empty is_partial_filled_cavity = is_cavity and is_partial_filled + is_underperforming_cavity = is_cavity and is_underperforming + + # Check if it has internal or external wall insulation + has_internal_wall_insulation = self.walls["internal_insulation"] + has_external_wall_insulation = self.walls["external_insulation"] + + if has_internal_wall_insulation or has_external_wall_insulation: + self.cavity = { + "suitability": False, + "type": "internal or external wall insulation" + } + return if is_unfilled_cavity: self.cavity = { @@ -170,6 +198,13 @@ class Eligibility: } return + if is_underperforming_cavity: + self.cavity = { + "suitability": True, + "type": "underperforming" + } + return + self.cavity = { "suitability": False, "type": "full" @@ -223,6 +258,14 @@ class Eligibility: } def suspended_floor_insulation(self): + + if "no_data" in self.floor.keys(): + if self.floor["no_data"]: + self.suspended_floor = { + "suitability": False, + } + return + is_suspended = self.floor["is_suspended"] is_insulated = self.floor["insulation_thickness"] in ["average", "above average"] @@ -232,6 +275,14 @@ class Eligibility: return def solid_floor_insulation(self): + + if "no_data" in self.floor.keys(): + if self.floor["no_data"]: + self.solid_floor = { + "suitability": False, + } + return + is_solid = self.floor["is_solid"] is_insulated = self.floor["insulation_thickness"] in ["average", "above average"] @@ -305,7 +356,8 @@ class Eligibility: """ current_sap = int(self.epc["current-energy-efficiency"]) - if current_sap > 54: + + if current_sap >= 69: self.eco4_warmfront = { "eligible": 
False, "message": "sap too high" @@ -319,9 +371,22 @@ class Eligibility: is_eligible = self.cavity["suitability"] & self.loft["suitability"] if post_retrofit_sap is None: + + if current_sap >= 55: + message = "Possibly eligible but property currently EPC D" + else: + message = "subject to post retrofit sap" if is_eligible else "not eligible" + + # Update the message to flag properties that failed just because of a full cavity. + # We need to double check that the wall is a cavity, that the loft is suitable and that the + # sap is within reason + # We can then estimate the age of the cavity fill + if not is_eligible and (current_sap < 69) and self.loft["suitability"] and self.walls["is_cavity_wall"]: + message = "Failed due to full cavity - check cavity age" + self.eco4_warmfront = { "eligible": is_eligible, - "message": "subject to post retrofit sap" + "message": message } return diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py index 3c7ae901..76aadcc4 100644 --- a/etl/eligibility/ha_15_32/app.py +++ b/etl/eligibility/ha_15_32/app.py @@ -11,13 +11,12 @@ import numpy as np import msgpack from datetime import datetime, timedelta from utils.logger import setup_logger -from utils.s3 import read_from_s3 +from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet from dotenv import load_dotenv from backend.SearchEpc import SearchEpc from backend.Property import Property from etl.eligibility.Eligibility import Eligibility from etl.epc.DataProcessor import DataProcessor -from backend.app.utils import read_parquet_from_s3 from backend.app.plan.utils import create_recommendation_scoring_data from etl.epc.settings import COLUMNS_TO_MERGE_ON from backend.ml_models.api import ModelApi @@ -247,6 +246,8 @@ def merge_ha_15(asset_list, identified_addresses): identified_addresses = identified_addresses.drop_duplicates("merge_key") + # We pull out raw counts for the survey lists + # Check asset list for dupes asset_list_dupes = 
asset_list["merge_key"].duplicated() if asset_list_dupes.sum(): @@ -336,7 +337,10 @@ def merge_ha_15(asset_list, identified_addresses): return merged_data, dropped_identified_merge_keys -def prepare_model_data_row(property_id, modelling_epc, cleaned, cleaning_data, created_at): +def prepare_model_data_row( + property_id, modelling_epc, cleaned, cleaning_data, created_at, + photo_supply_lookup, floor_area_decile_thresholds, old_data=None, full_sap_epc=None, +): """ This function prepares the data for modelling, in the same fashion as the recommendation engine With up-coming refactoring, this will change @@ -346,15 +350,24 @@ def prepare_model_data_row(property_id, modelling_epc, cleaned, cleaning_data, c p = Property( id=property_id, postcode=modelling_epc["postcode"], - address1=modelling_epc["address1"], - epc_client=None, - data=modelling_epc + address=modelling_epc["address1"], + data=modelling_epc, + old_data=old_data, + full_sap_epc=full_sap_epc ) - p.get_components(cleaned) + p.get_components(cleaned, photo_supply_lookup=photo_supply_lookup, + floor_area_decile_thresholds=floor_area_decile_thresholds) + + # THIS IS TEMP AND SHOULDN'T BE HERE + data_to_clean = p.get_model_data() + if data_to_clean["NUMBER_HEATED_ROOMS"] in ['', None]: + data_to_clean["NUMBER_HEATED_ROOMS"] = data_to_clean["NUMBER_HABITABLE_ROOMS"] + p.data["number-heated-rooms"] = data_to_clean["NUMBER_HABITABLE_ROOMS"] + # This is temp - this should happen after scoring cleaned_property_data = DataProcessor.apply_averages_cleaning( - data_to_clean=pd.DataFrame([dict(**p.get_model_data(), LOCAL_AUTHORITY=p.data["local-authority"])]), + data_to_clean=pd.DataFrame([dict(**data_to_clean, LOCAL_AUTHORITY=p.data["local-authority"])]), cleaning_data=cleaning_data, cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], ) @@ -829,6 +842,18 @@ def analyse_ha_32_results(results, ha32, no_house_numbers): results_df["warmfront_identified"] ] + # Aggregates of no eco 
and gbis jobs identified + n_eco = results_df["eco4_eligible"].sum() + # Gbis is rows where eco4 is not eligible + n_gbis = results_df[ + (results_df["gbis_eligible"] == True) & (results_df["eco4_eligible"] == False) + ]["gbis_eligible"].sum() + + pipeline_potential = results_df[ + (results_df["warmfront_identified"] == True) | (results_df["eco4_eligible"] == True) | ( + results_df["gbis_eligible"] == True) + ] + success_rate = warmfront_identified["gbis_eligible"].sum() / warmfront_identified.shape[0] # For HA32, this is 89% @@ -886,8 +911,16 @@ def analyse_ha_32_results(results, ha32, no_house_numbers): new_possibilities = results_df[ (~results_df["warmfront_identified"]) & - (results_df["gbis_eligible"] | results_df["eco4_eligible"]) & - (results_df["tenure"] == "Rented (social)") + (results_df["gbis_eligible"] | results_df["eco4_eligible"]) + ].copy() + + new_possibilities_eco = results_df[ + (~results_df["warmfront_identified"]) & + (results_df["eco4_eligible"] == True) + ].copy() + new_possibilities_gbis = results_df[ + (~results_df["warmfront_identified"]) & + (results_df["eco4_eligible"] == False) & (results_df["gbis_eligible"] == True) ].copy() future_possibilities_eco = results_df[ @@ -947,6 +980,8 @@ def analyse_ha_15_results(results_df, ha15, no_house_numbers): results_df["warmfront_identified"] ] + warmfront_identified = warmfront_identified + n_identified = (warmfront_identified["gbis_eligible"] | warmfront_identified["eco4_eligible"]).sum() success_rate = n_identified / warmfront_identified.shape[0] @@ -955,6 +990,11 @@ def analyse_ha_15_results(results_df, ha15, no_house_numbers): "eligibility_classification"].value_counts() # For HA15 this is 50.3% + pipeline_potential = results_df[ + (results_df["warmfront_identified"] == True) | (results_df["eco4_eligible"] == True) | ( + results_df["gbis_eligible"] == True) + ] + # of the properties we identify, what is the mix of confidenc missed = results_df[ @@ -973,32 +1013,37 @@ def 
analyse_ha_15_results(results_df, ha15, no_house_numbers): missed["sap"] < 69 ] - sap_low_enough["walls"].value_counts() - z = ha15[ha15["row_id"].isin(sap_too_high["row_id"].values)] - - investigate_1 = ha15[ha15["row_id"].isin(sap_too_high["row_id"])][ - ["row_id", "Postcode", "Address Line 1", "Address Line 2", "Address Line 3"]] - - investigate_2 = ha15[ha15["row_id"].isin(sap_low_enough["row_id"])][ - ["row_id", "Postcode", "Address Line 1", "Address Line 2", "Address Line 3"]] - - missed["message"].value_counts() + # Aggregates of no eco and gbis jobs identified + n_eco = results_df["eco4_eligible"].sum() + # Gbis is rows where eco4 is not eligible + n_gbis = results_df[ + (results_df["gbis_eligible"] == True) & (results_df["eco4_eligible"] == False) + ]["gbis_eligible"].sum() # We now look for properties that we identified, that were not identified by Warmfront new_possibilities = results_df[ (~results_df["warmfront_identified"]) & - ((results_df["gbis_eligible"] == True) | (results_df["eco4_eligible"] == True)) & - (results_df["tenure"] == "Rented (social)") + ((results_df["gbis_eligible"] == True) | (results_df["eco4_eligible"] == True)) + ].copy() + + new_possibilities_eco = results_df[ + (~results_df["warmfront_identified"]) & + (results_df["eco4_eligible"] == True) + ].copy() + + new_possibilities_gbis = results_df[ + (~results_df["warmfront_identified"]) & + (results_df["eco4_eligible"] == False) & (results_df["gbis_eligible"] == True) ].copy() # These are future possibilityies - new_possibilities_eco = results_df[ + future_possibilities_eco = results_df[ (~results_df["warmfront_identified"]) & (results_df["eco4_eligible_future"] == True) & (~(results_df["gbis_eligible"] | results_df["eco4_eligible"])) ].copy() - new_possibilities_gbis = results_df[ + future_possibilities_gbis = results_df[ (~results_df["warmfront_identified"]) & (results_df["gbis_eligible_future"] == True) & (results_df["eco4_eligible_future"] == False) & ( 
# Directory (relative to the working directory the script is run from) holding the HA16 spreadsheets.
_HA16_DATA_DIR = "etl/eligibility/ha_15_32"

_HA16_ASSET_LIST_FILENAMES = (
    "HESTIA - HA 16 ASSET LIST PART 1 OF 2.xlsx",
    "HESTIA - HA 16 ASSET LIST PART 2 OF 2.xlsx",
)
_HA16_SURVEY_LIST_FILENAME = "HESTIA- HA 16 ECO4 SURVEY LIST.xlsx"

_STREET_COLUMN = "Street / Block Name"

# Ordered (wrong, right) literal substitutions applied to the normalised (lower-cased) survey street names.
# ORDER MATTERS: e.g. "lichens crescent, fitton hill" must be handled before the generic ", fitton hill" rule.
# The replacement values are reproduced exactly as they were (including the odd capital letter); matching
# against the asset list lower-cases the street name again, so the capitals are harmless.
_HA16_STREET_CORRECTIONS = (
    ("eccels", "eccles"),
    ("chatley, road", "chatley road"),
    ("vaughen", "Vaughan"),
    ("cresent", "crescent"),
    ("plantation road", "plantation avenue"),
    ("how clough drive", "howclough drive"),
    ("brockhurst lane", "brookhurst lane"),
    ("biirch road", "birch road"),
    ("hadson road", "hodson road"),
    ("harbonne avennue", "narbonne avenue"),
    ("cumberland road, cadishead", "cumberland avenue, cadishead"),
    ("aston field drive", "ashton field drive"),
    ("wedgewood road", "wedgwood road"),
    ("hamilton close", "hamilton avenue"),
    ("lichens crescent, fitton hill", "lichens crescent"),
    ("south croft, fitton hill", "south croft"),
    (", fitton hill", ""),
    ("firtree dr", "fir tree avenue"),
    ("hawthorne road", "hawthorn crescent"),
    ("rein lee avenue", "reins lee avenue"),
    ("westerhill road", "wester hill road"),
    ("st martins road", "saint martins road"),
    ("timperley avenue", "timperley close"),
    ("eastwood road", "eastwood avenue"),
    ("new road", "new street"),
    ("grassmere road", "grasmere road"),
    ("hulton road", "hulton avenue"),
    ("beechfield avenue", "beechfield road"),
    ("princess avenue", "princes avenue"),
    ("edge ford crecent", "edge fold crescent"),
    ("conniston avenue", "coniston avenue"),
    ("blackthorne crescent", "blackthorn crescent"),
    ("wellstock road", "wellstock lane"),
    ("brackley avenue", "brackley street"),
    ("brook avenue swinton", "brook avenue, swinton"),
    ("green avenue swinton", "green avenue, swinton"),
    ("grasmere avenue wardley", "grasmere avenue, wardley"),
    ("mardale avenue wardle", "mardale avenue, wardle"),
    ("carleach grove", "cartleach Grove"),
    ("arbour grove", "arbor Grove"),
)

# Columns of the survey-to-asset match table (declared up front so an empty survey list still merges cleanly).
_MATCH_COLUMNS = [
    "survey_key", "matched_address", "survey_house_no", "survey_street_name", "survey_postcode", "survey_status",
]


def _read_sheet_with_colours(path):
    """
    Read the active sheet of an Excel workbook, keeping the fill colour of each row's first cell.
    :param path: Path of the workbook to open
    :return: (header, rows, colours) - the values of row 1, the cell values of every later row, and for each of
        those rows the fill colour index of its first cell (None when the index is '00000000')
    """
    sheet = openpyxl.load_workbook(path).active
    header = [cell.value for cell in sheet[1]]

    rows = []
    colours = []
    # values_only=False because we need the cell objects (not just values) to read the fill colour
    for row in sheet.iter_rows(min_row=2, values_only=False):
        rows.append([cell.value for cell in row])
        colour = row[0].fill.start_color.index
        colours.append(None if colour == '00000000' else colour)

    return header, rows, colours


def _normalise_survey_street_names(street):
    """
    Tidy the free-text street names of the survey list (lower case, "rd" -> "road", consistent comma spacing).
    :param street: pandas Series of raw street / block names
    :return: pandas Series of normalised names
    """
    street = street.str.replace("/", ", ", regex=False)
    street = street.str.lower()
    # NOTE: the previous special case for "REEDS RD" was evaluated after lower-casing and so could never match;
    # "reeds rd" is covered by the generic rule below
    street = street.str.replace(r'\brd\b', 'road', regex=True)
    street = street.str.replace(" , ", ", ", regex=False)
    # Fix "{place} ,{place}" with "{place}, {place}"
    street = street.str.replace(r'\s*,\s*', ', ', regex=True)
    return street.str.strip()


def _match_survey_row(asset_list, address_lower, row):
    """
    Find the single asset list row that a survey row refers to.
    We filter on street name and house number within the address, then fall back on an exact house number match
    and finally on the postcode, stopping as soon as exactly one candidate remains.
    :param asset_list: The asset list dataframe (needs Address, HouseNo and Postcode columns)
    :param address_lower: Lower-cased asset_list["Address"], pre-computed once by the caller
    :param row: A row of the survey list
    :return: The Address of the matched asset list row
    :raises ValueError: If the survey row cannot be matched to exactly one asset list row
    """
    # str() handles numeric house numbers; lower() handles suffixes such as "12A"
    house_number = str(row["NO."]).lower()
    street_name = row[_STREET_COLUMN].lower()

    # regex=False: street names and house numbers are literal text, not patterns. na=False: skip missing addresses
    in_address = (
        address_lower.str.contains(street_name, regex=False, na=False) &
        address_lower.str.contains(house_number, regex=False, na=False)
    )
    candidates = asset_list[in_address]

    if candidates.shape[0] != 1:
        # The house number was lower-cased above, so HouseNo must be lower-cased too for e.g. "12A" to match
        candidates = candidates[candidates["HouseNo"].str.lower() == house_number]

    if candidates.shape[0] != 1:
        candidates = candidates[
            candidates["Postcode"].str.lower().str.contains(row["Post Code"].lower(), regex=False, na=False)
        ]

    if candidates.shape[0] != 1:
        raise ValueError(
            f"Investigate: survey row {row['survey_key']} ({row['NO.']} {row[_STREET_COLUMN]}, "
            f"{row['Post Code']}) matched {candidates.shape[0]} asset list rows, expected exactly 1"
        )

    return candidates["Address"].values[0]


def load_data():
    """
    Load the HA16 asset list and ECO4 survey list from Excel and flag which assets were surveyed.
    The asset list is spread across two workbooks which are combined. The survey list contains free-text
    addresses with known typos, which are corrected before each survey row is matched to an asset list row.
    :return: (data, survey_list) - the asset list left-joined with the matched survey rows (with a boolean
        warmfront_identified column) and the cleaned survey list
    :raises ValueError: If a survey row cannot be matched to exactly one asset list row
    """

    # This asset list is spread across two sheets, which we need to combine
    asset_header = None
    asset_rows = []
    asset_colours = []
    for asset_list_filename in _HA16_ASSET_LIST_FILENAMES:
        header, rows, colours = _read_sheet_with_colours(f"{_HA16_DATA_DIR}/{asset_list_filename}")
        if asset_header is None:
            # Column names are taken from the first workbook only
            asset_header = header
        asset_rows.extend(rows)
        asset_colours.extend(colours)

    asset_list = pd.DataFrame(asset_rows, columns=asset_header)
    # Remove None columns
    asset_list = asset_list.iloc[:, 0:12]
    asset_list['row_color'] = asset_colours

    # Anything that is neither red nor green (including rows with no fill) is labelled yellow
    asset_list["row_colour_name"] = np.where(
        asset_list["row_color"] == "FFFF0000", "red",
        np.where(asset_list["row_color"] == "FF92D050", "green", "yellow")
    )

    # Split up the address on commas, which is useful for matching later
    # NOTE: the column assignment below requires the split to produce exactly five parts
    split_addresses = asset_list['Address'].str.split(',', expand=True)
    split_addresses.columns = ['temp', 'address2', 'address3', 'address4', 'address5']
    asset_list = pd.concat([asset_list, split_addresses], axis=1)

    # There is no commas separating house number and address 1
    split_addresses2 = asset_list['temp'].str.split(' ', expand=True)
    split_addresses2.columns = ['HouseNo', 'part1', 'part2', "part3", "part4"]
    # We could re-concatenate but we only care about HouseNo for the moment
    asset_list = pd.concat([asset_list, split_addresses2[["HouseNo"]]], axis=1)

    # We now read in the survey list
    survey_header, survey_rows, survey_colours = _read_sheet_with_colours(
        f"{_HA16_DATA_DIR}/{_HA16_SURVEY_LIST_FILENAME}"
    )
    survey_list = pd.DataFrame(survey_rows, columns=survey_header)

    # For the survey list, we don't need the colours, since there is a column called "INSTALLED OR CANCELLED"
    # which describes the status of the property
    survey_list["row_colour"] = survey_colours
    survey_list["survey_key"] = [f"survey_{i}" for i in range(len(survey_list))]

    # Tidy up the street/block name a bit
    survey_list[_STREET_COLUMN] = _normalise_survey_street_names(survey_list[_STREET_COLUMN])

    # Correct errors
    survey_list["Post Code"] = np.where(
        survey_list["Post Code"] == "M38 0SA",
        "M38 9SA",
        survey_list["Post Code"]
    )
    survey_list["Post Code"] = np.where(
        (survey_list[_STREET_COLUMN] == "nelson drive") & (survey_list["Post Code"] == "M44 5JE"),
        "M44 5JF",
        survey_list["Post Code"]
    )

    street = survey_list[_STREET_COLUMN]
    for wrong, right in _HA16_STREET_CORRECTIONS:
        street = street.str.replace(wrong, right, regex=False)
    survey_list[_STREET_COLUMN] = street

    # Replacement for clively avenue 66-68
    survey_list["NO."] = np.where(
        survey_list["NO."] == "66-68",
        "66",
        survey_list["NO."]
    )

    # We now need to merge the survey list onto the asset list
    # Could be easier just to do a search on each row, even though it's much slower
    address_lower = asset_list["Address"].str.lower()  # loop invariant, so computed once
    matched = []
    for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
        matched.append(
            {
                "survey_key": row["survey_key"],
                "matched_address": _match_survey_row(asset_list, address_lower, row),
                "survey_house_no": row["NO."],
                "survey_street_name": row[_STREET_COLUMN],
                "survey_postcode": row["Post Code"],
                "survey_status": row["INSTALLED OR CANCELLED"]
            }
        )

    matched = pd.DataFrame(matched, columns=_MATCH_COLUMNS)
    matched["warmfront_identified"] = True

    # Combine asset list and surveys
    data = asset_list.merge(
        matched, how="left", left_on="Address", right_on="matched_address",
    )
    # Assets with no matching survey row come out of the left join as NaN
    data["warmfront_identified"] = data["warmfront_identified"].fillna(False)

    return data, survey_list
+ 'Mid Terraced Town House': {"property-type": "House", "built-form": "Mid-Terrace"}, + } + + for index, property_meta in tqdm(data.iterrows(), total=len(data)): + + searcher = SearchEpc( + address1=property_meta["HouseNo"], + postcode=property_meta["Postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key=None, + full_address=property_meta["Address"] + ) + searcher.ordnance_survey_client.property_type = property_type_lookup[property_meta["Type"]]["property-type"] + searcher.ordnance_survey_client.built_form = property_type_lookup[property_meta["Type"]]["built-form"] + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + nodata.append(property_meta) + continue + + if searcher.newest_epc.get("estimated"): + # We insert the row ID as our proxy for UPRN + proxy_uprn = int(property_meta["row_id"].split("_")[1]) + searcher.newest_epc["uprn"] = proxy_uprn + + newest_epc = searcher.newest_epc + older_epcs = searcher.older_epcs + full_sap_epc = searcher.full_sap_epc + # We also want to get the penultimate epc + penultimate_epc, _ = searcher.filter_newest_epc(older_epcs) + if not penultimate_epc: + penultimate_epc = newest_epc + + eligibility = Eligibility(epc=newest_epc, cleaned=cleaned) + eligibility.check_gbis_warmfront() + eligibility.check_eco4_warmfront() + + if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront): + eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned) + eligibility.check_gbis_warmfront() + eligibility.check_eco4_warmfront() + # If this is the case, we need to update the older epcs + # We don't update just to make data cleaning easier + if penultimate_epc.get("estimated") is None: + older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]] + + # If the property is a cavity wall and it's filled, we produce an estimate for the age of the cavity + + # Loft MUST be suitable + cavity_age = None + if ( + eligibility.walls["is_cavity_wall"] and + 
eligibility.walls["is_filled_cavity"] and + eligibility.loft["suitability"] and + eligibility.eco4_warmfront["message"] == "Failed due to full cavity - check cavity age" + ): + # We check the age of the cavity and if it's particularly old, we flag it + cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned) + + # Full checks + eligibility.check_gbis() + eligibility.check_eco4() + + if eligibility.eco4_warmfront["eligible"]: + if eligibility.epc["uprn"] == "": + eligibility.epc["uprn"] = int(property_meta["row_id"].split("_")[1]) + + scoring_dictionary = prepare_model_data_row( + property_id=property_meta["row_id"], + modelling_epc=eligibility.epc, + cleaned=cleaned, + cleaning_data=cleaning_data, + created_at=created_at, + old_data=older_epcs, + full_sap_epc=full_sap_epc, + photo_supply_lookup=photo_supply_lookup, + floor_area_decile_thresholds=floor_area_decile_thresholds + ) + scoring_data.extend(scoring_dictionary) + + results.append( + { + "row_id": property_meta["row_id"], + "uprn": eligibility.epc["uprn"], + "Address": property_meta["Address"], + "Postcode": property_meta["Postcode"], + "property_type": eligibility.epc["property-type"], + "gbis_eligible": eligibility.gbis_warmfront, + "eco4_eligible": eligibility.eco4_warmfront["eligible"], + "eco4_message": eligibility.eco4_warmfront["message"], + "sap": float(eligibility.epc["current-energy-efficiency"]), + "gbis_eligible_future": eligibility.gbis["eligible"], + "gbis_eligible_future_message": eligibility.gbis["message"], + "eco4_eligible_future": eligibility.eco4["eligible"], + "eco4_eligible_future_message": eligibility.eco4["message"], + # Property components + "roof": eligibility.roof["clean_description"], + "walls": eligibility.walls["clean_description"], + "cavity_type": eligibility.cavity["type"], + "heating": eligibility.epc["mainheat-description"], + "tenure": eligibility.tenure, + "date_epc": eligibility.epc["lodgement-date"], + "loft_thickness": 
eligibility.roof["insulation_thickness"], + "cavity_age": cavity_age, + **eligibility.walls, + **eligibility.roof, + } + ) + + scoring_df = pd.DataFrame(scoring_data) + + # Perform the same cleaning as in the model - first clean number of room variables though + scoring_df = DataProcessor.apply_averages_cleaning( + data_to_clean=scoring_df, + cleaning_data=cleaning_data, + cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], + colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], + ) + + scoring_df = DataProcessor.apply_averages_cleaning( + data_to_clean=scoring_df, + cleaning_data=cleaning_data, + cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"], + ).drop(columns=["LOCAL_AUTHORITY"]) + + scoring_df = DataProcessor.clean_missings_after_description_process( + scoring_df, + ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or ( + "insulation_thickness" in c) or ("ENERGY_EFF" in c)] + ) + + scoring_df = DataProcessor.clean_efficiency_variables(scoring_df) + scoring_df["UPRN"] = scoring_df["UPRN"].astype(int) + + model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at) + all_predictions = model_api.predict_all( + df=scoring_df, + bucket="retrofit-data-dev", + prediction_buckets={ + "sap_change_predictions": "retrofit-sap-predictions-dev", + "heat_demand_predictions": "retrofit-heat-predictions-dev", + "carbon_change_predictions": "retrofit-carbon-predictions-dev" + } + ) + + predictions = all_predictions["sap_change_predictions"].copy() + + results_df = pd.DataFrame(results) + + predictions = predictions.rename(columns={"property_id": "row_id"}).merge( + results_df[["row_id", "sap"]], how="left", on="row_id" + ) + predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"] + predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index() + + results_df = results_df.merge( + predictions[["sap_uplift", "row_id"]], + how="left", + 
on="row_id" + ) + results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"] + + eligibility_assessment = [] + for _, row in results_df[results_df["eco4_eligible"] == True].iterrows(): + # The upgrade requirements are dependent on the current SAP + + # If the property is an F or G, it only needs to upgrade to an % + if row["sap"] <= 38: + if row["post_install_sap"] >= 57: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 55: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 53: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + else: + + if row["post_install_sap"] >= 71: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 69: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 67: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + + eligibility_assessment.append( + { + "row_id": row["row_id"], + "eligibility_classification": eligibility_classification + } + ) + + eligibility_assessment = pd.DataFrame(eligibility_assessment) + + results_df = results_df.merge( + eligibility_assessment, how="left", on="row_id" + ) + return results_df, scoring_data, nodata + + +def analyse_results(results_df, data, survey_list): + analysis_data = data[["row_id", "survey_key", "warmfront_identified", "row_colour_name"]].merge( + results_df, how="left", on="row_id" + ).merge( + survey_list[["survey_key", survey_list.columns[0]]].rename(columns={survey_list.columns[0]: "funding_scheme"}), + how="left", on="survey_key" + ) + + analysis_data["roof_insulation_thickness"] = np.where( + pd.isnull(analysis_data["roof_insulation_thickness"]), None, analysis_data["roof_insulation_thickness"] + ) + analysis_data["roof_insulation_thickness_numeric"] = analysis_data["roof_insulation_thickness"].apply( + lambda x: convert_thickness_to_numeric(x, 
is_flat=False, is_pitched=True) + ) + + warmfront_sold_eco4 = analysis_data[ + (analysis_data["warmfront_identified"] == True) & ( + analysis_data["funding_scheme"].isin(["ECO4 A/W", "AFFORDABLE WARMTH"])) + ] # 1407 + + warmfront_sold_gbis = analysis_data[ + (analysis_data["warmfront_identified"] == True) & ( + analysis_data["funding_scheme"].isin(["ECO4 GBIS (ECO+)"])) + ] + + ideal_eco4_warmfront_not_sold = analysis_data[ + (analysis_data["eco4_eligible"] == True) & (analysis_data["warmfront_identified"] == False) & ( + analysis_data["roof_insulation_thickness_numeric"] <= 100) + ] + + secondary_eco4_warmfront_not_sold = analysis_data[ + (analysis_data["eco4_eligible"] == True) & (analysis_data["warmfront_identified"] == False) & ( + analysis_data["roof_insulation_thickness_numeric"] > 100) + ] + + # underperforming cavities + underperforming_cavities = analysis_data[ + (analysis_data["eco4_message"] == "Failed due to full cavity - check cavity age") & ( + analysis_data["cavity_age"] > 10 * 365 + ) & (analysis_data["roof_insulation_thickness_numeric"] <= 100) + ] + + identified_gbis_not_sold = analysis_data[ + (analysis_data["gbis_eligible"] == True) & (analysis_data["warmfront_identified"] == False) & ( + analysis_data["eco4_eligible"] == False + ) + ] + + eco_eligible = analysis_data[analysis_data["eco4_eligible"] == True] + eco_ineligible = analysis_data[analysis_data["eco4_eligible"] == False] + + eco_ineligible["eco4_message"].value_counts() + + # SAP too high: + sap_too_high = eco_ineligible[eco_ineligible["eco4_message"] == "sap too high"].copy() + further_possibilities = sap_too_high[ + sap_too_high["walls"].isin( + [ + "Cavity wall, as built, insulated", + "Cavity wall, as built, no insulation", + "Cavity wall, as built, partial insulation", + "Cavity wall, no insulation", + "Cavity wall, partial insulation" + ] + ) + ] + + filled_cavities = eco_ineligible[ + eco_ineligible["eco4_message"] == "sap too high" + ] + + warmfront_identified = 
analysis_data[analysis_data["warmfront_identified"]] + warmfront_identified["walls"].value_counts() + + all_identified_gbis = analysis_data[ + (analysis_data["warmfront_identified"] & analysis_data["funding_scheme"].isin( + ["ECO4 GBIS (ECO+)"])) | + (analysis_data["gbis_eligible"] & analysis_data["eco4_eligible"].isin([False, None])) + ] + + empty_cavity_desriptions = [ + "Cavity wall, as built, no insulation", "Cavity wall, as built, partial insulation", + "Cavity wall, no insulation", "Cavity wall, partial insulation" + ] + + empty_cavities = analysis_data[analysis_data["walls"].isin(empty_cavity_desriptions)] + remaining_empty = empty_cavities[~empty_cavities["warmfront_identified"]] + + warmfront_identified = analysis_data[analysis_data["warmfront_identified"]] + + # Of the ECO jobs, what proportion to we get right + warmfront_identified_eco = warmfront_identified[ + warmfront_identified["funding_scheme"].isin(["ECO4 A/W", "AFFORDABLE WARMTH"]) + ] + + eco_success_rate = warmfront_identified_eco["eco4_eligible"].sum() / warmfront_identified_eco.shape[0] + + warmfront_identified_gbis = warmfront_identified[ + warmfront_identified["funding_scheme"].isin(["ECO4 GBIS (ECO+)"]) + ] + + gbis_success_rate = warmfront_identified_gbis["gbis_eligible"].sum() / warmfront_identified_gbis.shape[0] + + # Additional identified + additional_identified_eco = analysis_data[ + (analysis_data["eco4_eligible"] == True) & (analysis_data["warmfront_identified"] == False) + ] + + additional_identified_eco["eligibility_classification"].value_counts() + + additional_identified_gbis = analysis_data[ + (analysis_data["gbis_eligible"] == True) & (analysis_data["eco4_eligible"] == False) & ( + analysis_data["warmfront_identified"] == False + ) + ].shape[0] + # Future + additional_identified_eco_future = analysis_data[ + (analysis_data["eco4_eligible_future"] == True) & (analysis_data["warmfront_identified"] == False) + ].shape[0] + additional_identified_gbis_future = analysis_data[ + 
def app():
    """
    Entry point: load the asset and survey lists, fetch the supporting datasets from S3 and run
    get_epc_data over the asset list.

    Nothing is returned or persisted here; the results only live in local variables.
    """
    bucket = "retrofit-data-dev"

    asset_data, survey_list = load_data()
    # Give every asset row a stable identifier of the form "ha16_<position>"
    asset_data["row_id"] = [f"ha16_{position}" for position in range(len(asset_data))]

    # The cleaned EPC data is stored as a msgpack payload
    cleaned = msgpack.unpackb(
        read_from_s3(s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name=bucket),
        raw=False
    )

    cleaning_data = read_dataframe_from_s3_parquet(
        bucket_name=bucket,
        file_key="sap_change_model/cleaning_dataset.parquet",
    )

    run_timestamp = datetime.now().isoformat()

    photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket=bucket)

    results_df, scoring_data, nodata = get_epc_data(
        asset_data,
        cleaned,
        cleaning_data,
        run_timestamp,
        photo_supply_lookup,
        floor_area_decile_thresholds,
    )

    # Earlier runs cached {"scoring_data", "results", "nodata"} with pickle in "ha16_10_jan.pickle"
    # (the older file was "ha16.pickle") and read them back from there; that code is disabled.
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
if not ENV_FILE.exists():
    # This module already lives in etl/eligibility/ha_15_32, so the path above only resolves when
    # the file sits at the repository root. Fall back to the .env next to this module.
    ENV_FILE = Path(__file__).parent / ".env"

logger = setup_logger()
# The .env file must be loaded BEFORE the token is read, otherwise a token that is only defined
# in the .env file is never picked up
load_dotenv(ENV_FILE)
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")


def _read_sheet_with_row_colours(path):
    """
    Read the active sheet of an Excel workbook.

    :param path: Path of the workbook to open
    :return: Tuple of (header values from the first row, list of row values from the second row
        onwards, list with the fill colour index of the first cell of each of those rows)
    """
    sheet = openpyxl.load_workbook(path).active
    header = [cell.value for cell in sheet[1]]

    rows = []
    colours = []
    for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
        rows.append([cell.value for cell in row])
        colour = row[0].fill.start_color.index
        # '00000000' is treated as "no colour"
        colours.append(colour if colour != '00000000' else None)

    return header, rows, colours


def load_data():
    """
    Load the HA 24 asset list and ECO4 survey list and match every survey to an asset.

    :return: Tuple of (asset list left-joined with the matched surveys, including a boolean
        "warmfront_identified" column, and the cleaned survey list)
    :raises ValueError: If a survey row cannot be matched to exactly one asset
    """
    sheet_colnames, rows_data, rows_colors = _read_sheet_with_row_colours(
        'etl/eligibility/ha_15_32/HESTIA - HA 24 ASSET LIST.xlsx'
    )

    asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
    # Remove None columns
    asset_list = asset_list.iloc[:, 0:10]
    asset_list['row_color'] = rows_colors

    asset_list["row_colour_name"] = np.where(
        asset_list["row_color"] == "FFFF0000", "red",
        np.where(asset_list["row_color"] == "FF92D050", "green", "yellow")
    )

    asset_list["row_colour_code"] = np.where(
        asset_list["row_colour_name"] == "red", "does not meet criteria",
        np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
    )

    # The third column is listed as "Address" but it's actually the postcode. We have two Address
    # columns so we change just the third. The labels are rebuilt as a list because an Index is
    # immutable and writing into columns.values is not supported.
    column_names = list(asset_list.columns)
    column_names[2] = "Postcode"
    asset_list.columns = column_names

    # Split up the address on commas, which is useful for matching later. The number of parts
    # depends on the data, so the column names are generated: temp, address2, address3, ...
    split_addresses = asset_list['Address'].str.split(',', expand=True)
    split_addresses.columns = ['temp'] + [f'address{i}' for i in range(2, split_addresses.shape[1] + 1)]
    asset_list = pd.concat([asset_list, split_addresses], axis=1)

    # There are no commas separating the house number and address 1, so the house number is the
    # first space-separated token. We only care about HouseNo for the moment.
    asset_list["HouseNo"] = asset_list['temp'].str.split(' ').str[0]

    # Read in surveys
    survey_colnames, survey_rows, survey_colors = _read_sheet_with_row_colours(
        'etl/eligibility/ha_15_32/HESTIA - HA 24 ECO4 SURVEY LIST.xlsx'
    )
    survey_list = pd.DataFrame(survey_rows, columns=survey_colnames)
    survey_list["row_colour"] = survey_colors

    # Tidy up the street/block name a bit
    street = survey_list["Street / Block Name"].str.replace("/", ", ", regex=False)
    street = street.str.lower().str.strip()

    # Manual corrections so that survey street names agree with the asset list. The order matters:
    # the specific "council house, <street>" fixes run before the general prefix removal.
    street_name_fixes = (
        ("council house, nidds lane", "nidds lane"),
        ("wirral avenue", "wirrall avenue"),
        ("st ives road", "st. ives crescent"),
        ("sundringham road", "sandringham road"),
        ("milton avenue", "milton road"),
        ("st ives crescent", "st. ives crescent"),
        ("council house, waterbelly lane", "waterbelly lane"),
        # Generally remove "council house, " from the start of the street name
        ("council house, ", ""),
        ("st. leodegars close", "st leodegars close"),
    )
    for wrong, right in street_name_fixes:
        # regex=False keeps these as literal replacements on every pandas version
        street = street.str.replace(wrong, right, regex=False)
    survey_list["Street / Block Name"] = street

    # Drop all None rows. Copy so that the column assignment below writes to an independent frame
    # and not to a slice of the original.
    survey_list = survey_list[~pd.isnull(survey_list["Street / Block Name"])].copy()
    survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(survey_list))]

    # Lower-cased once, outside of the matching loop
    asset_address_lower = asset_list["Address"].str.lower()

    matched = []
    for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
        house_number = row["NO."]
        if isinstance(house_number, str):
            house_number = house_number.lower()
        house_number = str(house_number)

        # Filter on the first line of the address. The matching is literal (regex=False) so that
        # characters such as "." or "(" in the data are not interpreted as a pattern, and
        # na=False stops assets without an address from breaking the boolean mask.
        street_name = row["Street / Block Name"].lower()
        df = asset_list[asset_address_lower.str.contains(street_name, regex=False, na=False)].copy()
        df = df[df["Address"].str.lower().str.contains(house_number, regex=False, na=False)]
        if df.shape[0] != 1:
            # house_number has been lower-cased, so HouseNo has to be lower-cased as well
            df = df[df["HouseNo"].str.lower() == house_number]
        if df.shape[0] != 1:
            df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower(), regex=False, na=False)]
        if df.shape[0] != 1:
            raise ValueError(
                f"Investigate: {df.shape[0]} asset matches for survey street "
                f"'{row['Street / Block Name']}', house number '{house_number}', "
                f"postcode '{row['Post Code']}'"
            )

        matched.append(
            {
                "survey_key": row["survey_key"],
                "matched_address": df["Address"].values[0],
                "survey_house_no": row["NO."],
                "survey_street_name": row["Street / Block Name"],
                "survey_postcode": row["Post Code"],
                "survey_status": row["INSTALLED OR CANCELLED"]
            }
        )

    # Explicit columns keep the merge below valid even when the survey list is empty
    matched = pd.DataFrame(
        matched,
        columns=[
            "survey_key", "matched_address", "survey_house_no", "survey_street_name", "survey_postcode",
            "survey_status"
        ]
    )
    matched["warmfront_identified"] = True

    # Combine asset list and surveys
    data = asset_list.merge(
        matched, how="left", left_on="Address", right_on="matched_address",
    )
    # Assets without a survey come out of the left join as NaN; eq(True) turns them into False
    # and guarantees a real boolean column
    data["warmfront_identified"] = data["warmfront_identified"].eq(True)

    return data, survey_list
+ "05 BEDSIT": "Flat", + "04 MAISONETTE": "Maisonette", + "01 HOUSE MID": "House", + "10 PBUNGALOW": "Bungalow", + "14 SFLAT": "Flat", + "12 SBEDSIT": "Flat", + "11 PFLAT": "Flat", + "13 SBUNGALOW": "Bungalow", + " 01 HOUSE MID": "House", + "09 PBEDSIT": "Flat" + } + + for _, property_meta in tqdm(data.iterrows(), total=len(data)): + + searcher = SearchEpc( + address1=property_meta["HouseNo"], + postcode=property_meta["Postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key=None, + full_address=property_meta["Address"] + ) + searcher.ordnance_survey_client.property_type = property_type_lookup[property_meta["Property Type"]] + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + nodata.append(property_meta) + continue + + newest_epc = searcher.newest_epc + older_epcs = searcher.older_epcs + full_sap_epc = searcher.full_sap_epc + # We also want to get the penultimate epc + penultimate_epc, _ = searcher.filter_newest_epc(older_epcs) + if not penultimate_epc: + penultimate_epc = newest_epc + + eligibility = Eligibility(epc=newest_epc, cleaned=cleaned) + eligibility.check_gbis_warmfront() + eligibility.check_eco4_warmfront() + + if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront): + eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned) + eligibility.check_gbis_warmfront() + eligibility.check_eco4_warmfront() + # If this is the case, we need to update the older epcs + # older_epcs = [ + # x for x in older_epcs if x["lmk-key"] not in [newest_epc["lmk-key"], penultimate_epc["lmk-key"]] + # ] + # If this is the case, we need to update the older epcs + # We don't update just to make data cleaning easier + if penultimate_epc.get("estimated") is None: + older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]] + + # Loft MUST be suitable + cavity_age = None + if ( + eligibility.walls["is_cavity_wall"] and + eligibility.walls["is_filled_cavity"] and + 
eligibility.loft["suitability"] and + eligibility.eco4_warmfront["message"] == "Failed due to full cavity - check cavity age" + ): + # We check the age of the cavity and if it's particularly old, we flag it + cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned) + + # Full checks + eligibility.check_gbis() + eligibility.check_eco4() + + if eligibility.eco4_warmfront["eligible"]: + if eligibility.epc["uprn"] in ["", None]: + eligibility.epc["uprn"] = int(property_meta["row_id"].split("_")[1]) + + scoring_dictionary = prepare_model_data_row( + property_id=property_meta["row_id"], + modelling_epc=eligibility.epc, + cleaned=cleaned, + cleaning_data=cleaning_data, + created_at=created_at, + old_data=older_epcs, + full_sap_epc=full_sap_epc, + photo_supply_lookup=photo_supply_lookup, + floor_area_decile_thresholds=floor_area_decile_thresholds + ) + scoring_data.extend(scoring_dictionary) + + results.append( + { + "row_id": property_meta["row_id"], + "uprn": eligibility.epc["uprn"], + "Address": property_meta["Address"], + "Postcode": property_meta["Postcode"], + "property_type": eligibility.epc["property-type"], + "gbis_eligible": eligibility.gbis_warmfront, + "eco4_eligible": eligibility.eco4_warmfront["eligible"], + "eco4_message": eligibility.eco4_warmfront["message"], + "sap": float(eligibility.epc["current-energy-efficiency"]), + "gbis_eligible_future": eligibility.gbis["eligible"], + "gbis_eligible_future_message": eligibility.gbis["message"], + "eco4_eligible_future": eligibility.eco4["eligible"], + "eco4_eligible_future_message": eligibility.eco4["message"], + # Property components + "roof": eligibility.roof["clean_description"], + "walls": eligibility.walls["clean_description"], + "cavity_type": eligibility.cavity["type"], + "heating": eligibility.epc["mainheat-description"], + "tenure": eligibility.tenure, + "date_epc": eligibility.epc["lodgement-date"], + "cavity_age": cavity_age, + **eligibility.walls, + **eligibility.roof, + } + ) + + scoring_df 
= pd.DataFrame(scoring_data) + + # Perform the same cleaning as in the model - first clean number of room variables though + scoring_df = DataProcessor.apply_averages_cleaning( + data_to_clean=scoring_df, + cleaning_data=cleaning_data, + cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], + colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], + ) + + scoring_df = DataProcessor.apply_averages_cleaning( + data_to_clean=scoring_df, + cleaning_data=cleaning_data, + cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"], + ).drop(columns=["LOCAL_AUTHORITY"]) + + scoring_df = DataProcessor.clean_missings_after_description_process( + scoring_df, + ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or ( + "insulation_thickness" in c) or ("ENERGY_EFF" in c)] + ) + + scoring_df = DataProcessor.clean_efficiency_variables(scoring_df) + scoring_df["UPRN"] = scoring_df["UPRN"].astype(int) + + model_api = ModelApi(portfolio_id="ha24-eligibility", timestamp=created_at) + all_predictions = model_api.predict_all( + df=scoring_df, + bucket="retrofit-data-dev", + prediction_buckets={ + "sap_change_predictions": "retrofit-sap-predictions-dev", + "heat_demand_predictions": "retrofit-heat-predictions-dev", + "carbon_change_predictions": "retrofit-carbon-predictions-dev" + } + ) + + predictions = all_predictions["sap_change_predictions"].copy() + + results_df = pd.DataFrame(results) + + predictions = predictions.rename(columns={"property_id": "row_id"}).merge( + results_df[["row_id", "sap"]], how="left", on="row_id" + ) + predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"] + predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index() + + results_df = results_df.merge( + predictions[["sap_uplift", "row_id"]], + how="left", + on="row_id" + ) + results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"] + + eligibility_assessment = [] + for _, row in 
def analyse_results(results_df, data, survey_list):
    """
    Compare the properties identified by the eligibility checks with the surveys already carried
    out, and return the resulting breakdowns.

    :param results_df: Results dataframe produced by get_epc_data
    :param data: Asset list with "row_id", "survey_key" and "warmfront_identified" columns
    :param survey_list: Survey list; its first column is used as the funding scheme
    :return: Dictionary holding the combined analysis frame, the filtered sub-frames, the wall
        type counts, the ECO success rate (NaN when there are no surveyed ECO jobs) and the
        additional-identified counts
    """
    funding_column = survey_list.columns[0]
    analysis_data = data[["row_id", "survey_key", "warmfront_identified"]].merge(
        results_df, how="left", on="row_id"
    ).merge(
        survey_list[["survey_key", funding_column]].rename(columns={funding_column: "funding_scheme"}),
        how="left", on="survey_key"
    )

    # Missing thicknesses are passed to convert_thickness_to_numeric as None rather than NaN
    analysis_data["roof_insulation_thickness"] = np.where(
        pd.isnull(analysis_data["roof_insulation_thickness"]), None, analysis_data["roof_insulation_thickness"]
    )
    analysis_data["roof_insulation_thickness_numeric"] = analysis_data["roof_insulation_thickness"].apply(
        lambda x: convert_thickness_to_numeric(x, is_flat=False, is_pitched=True)
    )

    # Properties without an EPC have NaN in the eligibility columns after the left join. Comparing
    # explicitly against True / False gives proper boolean masks in which NaN counts as neither.
    identified = analysis_data["warmfront_identified"] == True
    not_identified = analysis_data["warmfront_identified"] == False
    eco4_eligible = analysis_data["eco4_eligible"] == True
    eco4_not_eligible = analysis_data["eco4_eligible"] == False
    gbis_eligible = analysis_data["gbis_eligible"] == True
    funding_scheme = analysis_data["funding_scheme"]
    thin_loft = analysis_data["roof_insulation_thickness_numeric"] <= 100

    warmfront_sold_eco4 = analysis_data[identified & funding_scheme.isin(["ECO4 A/W", "AFFORDABLE WARMTH"])]
    warmfront_sold_gbis = analysis_data[identified & funding_scheme.isin(["ECO4 GBIS (ECO+)"])]

    additional_eco4_warmfront_not_sold = analysis_data[eco4_eligible & not_identified & thin_loft]

    # GBIS candidates exclude anything already counted as an additional ECO4 candidate
    additional_gbis_warmfront_not_sold = analysis_data[
        gbis_eligible & not_identified & (
            ~analysis_data["row_id"].isin(additional_eco4_warmfront_not_sold["row_id"].values)
        )
    ]

    all_identified_eco = analysis_data[
        (identified & funding_scheme.isin(["ECO4 A/W"])) | eco4_eligible
    ]

    all_identified_gbis = analysis_data[
        (identified & funding_scheme.isin(["ECO4 GBIS (ECO+)"])) |
        (gbis_eligible & analysis_data["eco4_eligible"].isin([False, None]))
    ]

    warmfront_identified = analysis_data[identified]

    # Of the ECO jobs, what proportion do we get right
    warmfront_identified_eco = warmfront_identified[
        warmfront_identified["funding_scheme"].isin(["ECO4 A/W", "AFFORDABLE WARMTH"])
    ]
    eco_job_count = warmfront_identified_eco.shape[0]
    # Guard against portfolios without any surveyed ECO job, which used to divide by zero
    eco_success_rate = (
        (warmfront_identified_eco["eco4_eligible"] == True).sum() / eco_job_count
        if eco_job_count else float("nan")
    )

    # No gbis success rate is calculated for this portfolio
    warmfront_identified_gbis = warmfront_identified[
        warmfront_identified["funding_scheme"].isin(["ECO4 GBIS (ECO+)"])
    ]

    # Additional identified
    additional_identified_eco = analysis_data[eco4_eligible & not_identified]
    additional_identified_gbis = analysis_data[gbis_eligible & eco4_not_eligible & not_identified].shape[0]

    # Future
    additional_identified_eco_future = analysis_data[
        (analysis_data["eco4_eligible_future"] == True) & not_identified
    ].shape[0]
    additional_identified_gbis_future = analysis_data[
        (analysis_data["gbis_eligible_future"] == True) & (analysis_data["eco4_eligible_future"] == False) &
        not_identified
    ].shape[0]

    # Everything used to be computed and then thrown away; hand it back to the caller instead
    return {
        "analysis_data": analysis_data,
        "warmfront_sold_eco4": warmfront_sold_eco4,
        "warmfront_sold_gbis": warmfront_sold_gbis,
        "additional_eco4_warmfront_not_sold": additional_eco4_warmfront_not_sold,
        "additional_gbis_warmfront_not_sold": additional_gbis_warmfront_not_sold,
        "additional_gbis_walls_counts": additional_gbis_warmfront_not_sold["walls"].value_counts(),
        "walls_counts": analysis_data["walls"].value_counts(),
        "all_identified_eco": all_identified_eco,
        "all_identified_gbis": all_identified_gbis,
        "warmfront_identified": warmfront_identified,
        "warmfront_identified_eco": warmfront_identified_eco,
        "warmfront_identified_gbis": warmfront_identified_gbis,
        "eco_success_rate": eco_success_rate,
        "additional_identified_eco": additional_identified_eco,
        "additional_identified_eco_classification_counts":
            additional_identified_eco["eligibility_classification"].value_counts(),
        "additional_identified_gbis": additional_identified_gbis,
        "additional_identified_eco_future": additional_identified_eco_future,
        "additional_identified_gbis_future": additional_identified_gbis_future,
    }
file mode 100644 index 00000000..7dd36726 --- /dev/null +++ b/etl/eligibility/ha_15_32/ha25_app.py @@ -0,0 +1,883 @@ +import os +import msgpack +import openpyxl +from pathlib import Path +from datetime import datetime +import pandas as pd +import numpy as np +from utils.s3 import read_from_s3 +from utils.logger import setup_logger +from dotenv import load_dotenv +from utils.s3 import read_dataframe_from_s3_parquet +from tqdm import tqdm +from backend.SearchEpc import SearchEpc +from etl.eligibility.Eligibility import Eligibility +from etl.eligibility.ha_15_32.app import prepare_model_data_row +from etl.epc.DataProcessor import DataProcessor +from etl.epc.settings import COLUMNS_TO_MERGE_ON +from backend.ml_models.api import ModelApi +from etl.solar.SolarPhotoSupply import SolarPhotoSupply +from recommendations.recommendation_utils import calculate_cavity_age +from recommendation_utils import convert_thickness_to_numeric + +import re + +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env" + +logger = setup_logger() +load_dotenv(ENV_FILE) + + +def load_data(): + workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx', data_only=True) + sheet = workbook.active + + rows_data = [] + rows_colors = [] + for row in sheet.iter_rows(min_row=1, values_only=True): # use values_only=True to get values + + row_data = list(row) # No need for comprehension, values_only=True returns a tuple of values + rows_data.append(row_data) + + # Headers are on the final row. 
Pop them off and store them and then remove them from rows_data + headers = rows_data.pop() + # The postcode header is None, so we replace it with "postcode" + headers[-1] = "postcode" + + # Handle colours separately + for row in sheet.iter_rows(min_row=1, values_only=False): + # Assume first cell color is indicative of entire row + row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None + rows_colors.append(row_color) + + # Remove the final row of colours, which is the header + rows_colors.pop() + + asset_list = pd.DataFrame(rows_data, columns=headers) + asset_list['row_color'] = rows_colors + + asset_list["row_colour_name"] = np.where( + asset_list["row_color"] == "FFFF0000", "red", + np.where(asset_list["row_color"] == "FF00B050", "green", "yellow") + ) + + asset_list["row_colour_code"] = np.where( + asset_list["row_colour_name"] == "red", "does not meet criteria", + np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future") + ) + + asset_list["address"] = asset_list["T1_Address"].copy().str.lower() + asset_list["address"] = asset_list["address"].str.replace("flat", "") + asset_list["address"] = asset_list["address"].str.strip() + + split_addresses = asset_list['address'].str.split(' ', expand=True) + split_addresses.columns = ['HouseNo', 'address2', 'address3', 'address4', 'address5', 'address6', 'address7', + 'address8', + 'address9', 'address10', 'address11', 'address12', 'address13', 'address14', ] + split_addresses["HouseNo"] = split_addresses["HouseNo"].str.replace(";", "") + + # We could re-concatenate but we only care about HouseNo for the moment + asset_list = pd.concat([asset_list, split_addresses[["HouseNo"]]], axis=1) + asset_list["postcode"] = asset_list["postcode"].str.strip() + + # We analysis historical ECO3 survey list + eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx') + eco3_survey_sheet = 
eco3_survey_workbook["CAVITY"] + + eco3_survey_rows = [] + eco3_survey_colors = [] + + for row in eco3_survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values + row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None + # row_color = COLOR_INDEX[row_color] + eco3_survey_rows.append(row_data) + eco3_survey_colors.append(row_color) + + # Some adhoc analysis on the eco3 survey list, just to get completion and cancellation rates historically + eco3_survey_list = pd.DataFrame(eco3_survey_rows, columns=[cell.value for cell in eco3_survey_sheet[1]]) + eco3_survey_list["row_colour"] = eco3_survey_colors + # Remove rows where street name is missing + eco3_survey_list = eco3_survey_list[~pd.isnull(eco3_survey_list["Street / Block Name"])] + # We need to parse the row colours + # We have the following mappings: + # FF7030A0: purple + # FF92D050: green + # FFFF0000: red + # FFFFFF00: yellow + # FF38FD23: green + eco3_survey_list["row_colour_name"] = np.where( + eco3_survey_list["row_colour"] == "FF7030A0", "purple", + np.where(eco3_survey_list["row_colour"] == "FF92D050", "green", + np.where(eco3_survey_list["row_colour"] == "FFFF0000", "red", + np.where(eco3_survey_list["row_colour"] == "FFFFFF00", "yellow", + np.where(eco3_survey_list["row_colour"] == "FF38FD23", "green", "unknown") + ) + ) + ) + ) + + # We map the meaning: + # red: cancelled + # green: installed advised install complete + # purple: installer advised install complete + post works EPC + # yellow: filler row - drop + eco3_survey_list["row_colour_code"] = np.where( + eco3_survey_list["row_colour_name"] == "red", "cancelled", + np.where(eco3_survey_list["row_colour_name"] == "green", "installed advised install complete", + np.where(eco3_survey_list["row_colour_name"] == "purple", + "installer advised install complete + post works EPC", + 
np.where(eco3_survey_list["row_colour_name"] == "yellow", "filler row - drop", "unknown") + ) + ) + ) + + # This is good enough for the indicative cancellation rates + + # We now read in the indicative survey list which identified pospects for ECO4 works + eco4_survey_workbook = openpyxl.load_workbook( + f'etl/eligibility/ha_15_32/HESTIA - HA 25 ADHOC ISOLATED IDENTIFIED PROPERTIES FOR CWI.xlsx' + ) + eco4_prospect_survey_sheet = eco4_survey_workbook["LiveWest"] + + eco4_prospects_survey_rows = [] + eco4_prospects_survey_colors = [] + + for row in eco4_prospect_survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values + row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None + # row_color = COLOR_INDEX[row_color] + eco4_prospects_survey_rows.append(row_data) + eco4_prospects_survey_colors.append(row_color) + + # Some adhoc analysis on the eco3 survey list, just to get completion and cancellation rates historically + eco4_prospects_survey_list = pd.DataFrame( + eco4_prospects_survey_rows, columns=[cell.value for cell in eco4_prospect_survey_sheet[1]] + ) + eco4_prospects_survey_list["row_colour"] = eco4_prospects_survey_colors + + eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.lower() + eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.strip() + + eco4_prospects_survey_list = eco4_prospects_survey_list[~pd.isnull(eco4_prospects_survey_list["ADDRESS 1"])] + eco4_prospects_survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(eco4_prospects_survey_list))] + + # Correct some errors in the survey list + eco4_prospects_survey_list["POSTCODE"] = np.where( + (eco4_prospects_survey_list["ADDRESS 1"] == "berry park") & + (eco4_prospects_survey_list["POSTCODE"] == "PL12 6HP"), + "PL12 6EN", + eco4_prospects_survey_list["POSTCODE"] + ) + 
+ # Remove semi colons from address in asset and survey list + asset_list["T1_Address"] = asset_list["T1_Address"].str.replace(";", "") + eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.replace(";", "") + + # In the prosepcts survey list, we have 6 WALKHAM MEADOWS listed twice, which should be 6a and 6b + eco4_prospects_survey_list.loc[838, "NO"] = "6a" + eco4_prospects_survey_list.loc[839, "NO"] = "6b" + + # 3, 7, 9 BOLDVENTURE ROAD should be BOLDVENTURE CLOSE + eco4_prospects_survey_list["ADDRESS 1"] = np.where( + (eco4_prospects_survey_list["ADDRESS 1"] == "boldventure road") & + (eco4_prospects_survey_list["NO"].isin([3, 7, 9])), + "boldventure close", + eco4_prospects_survey_list["ADDRESS 1"] + ) + + eco4_prospects_survey_list["ADDRESS 1"] = np.where( + (eco4_prospects_survey_list["ADDRESS 1"] == "old farm road") & ( + eco4_prospects_survey_list["POSTCODE"] == "PL5 1EP"), + "old school road", + eco4_prospects_survey_list["ADDRESS 1"] + ) + + eco4_prospects_survey_list["ADDRESS 1"] = np.where( + (eco4_prospects_survey_list["ADDRESS 1"] == "croft orchard") & ( + eco4_prospects_survey_list["POSTCODE"] == "TQ12 6RP") & ( + eco4_prospects_survey_list["NO"] == 52), + "drum way", + eco4_prospects_survey_list["ADDRESS 1"] + ) + + # String replace + eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.replace( + "the gulls, collaton road", "the gulls collaton road" + ) + eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.replace( + "crows-an-eglose", "crows-an-eglos" + ) + + # We have a high volume of rows that do not match + matched = [] + nomatch = [] + for _, row in tqdm(eco4_prospects_survey_list.iterrows(), total=len(eco4_prospects_survey_list)): + + # Not in the asset list + if (row["ADDRESS 1"] == "berry park") and row["NO"] in [40, 42] and row["POSTCODE"] == "PL12 6EN": + nomatch.append(row.to_dict()) + continue + + # Not in the asset list + if 
def map_year_to_age_band(year):
    """Map a build year to its EPC ``construction-age-band`` label.

    :param year: Build year as an int, a float or a numeric string. Anything
        ``int()`` cannot convert (``None``, NaN, infinities, free text) is
        treated as invalid.
    :return: The matching "England and Wales: ..." band label, or the string
        "Invalid Year" when the input cannot be converted to an integer.
    """
    try:
        year = int(year)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric objects (e.g. an empty spreadsheet cell)
        # ValueError: unparseable strings and NaN
        # OverflowError: +/- infinity
        return "Invalid Year"

    # (inclusive upper bound, label), in ascending order. The first band whose
    # upper bound is not exceeded wins, so no lower bounds are needed.
    bands = (
        (1899, "England and Wales: before 1900"),
        (1929, "England and Wales: 1900-1929"),
        (1949, "England and Wales: 1930-1949"),
        (1966, "England and Wales: 1950-1966"),
        (1975, "England and Wales: 1967-1975"),
        (1982, "England and Wales: 1976-1982"),
        (1990, "England and Wales: 1983-1990"),
        (1995, "England and Wales: 1991-1995"),
        (2002, "England and Wales: 1996-2002"),
        (2006, "England and Wales: 2003-2006"),
        (2011, "England and Wales: 2007-2011"),
    )
    for upper_bound, label in bands:
        if year <= upper_bound:
            return label

    # Every remaining year is 2012 or later.
    return "England and Wales: 2012 onwards"
None}, + "End Terrace Bungalow": {"property-type": "Bungalow", "built-form": "End-Terrace"}, + "Mid Terrace Bungalow": {"property-type": "Bungalow", "built-form": "Mid-Terrace"}, + "Bedsit": {"property-type": "Flat", "built-form": None}, + "Mid Terrace Housekeeping": {"property-type": "House", "built-form": "Mid-Terrace"}, + "Mid Terrace Housekeeping ": {"property-type": "House", "built-form": "Mid-Terrace"}, + "End Terrace Housex": {"property-type": "House", "built-form": "End-Terrace"}, + "Guest Room": {"property-type": None, "built-form": None} + } + + for _, property_meta in tqdm(data, total=len(data)): + + searcher = SearchEpc( + address1=property_meta["HouseNo"], + postcode=property_meta["postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key=None, + full_address=property_meta["address"] + ) + searcher.ordnance_survey_client.property_type = property_type_lookup[property_meta["T1_AssetType"]][ + "property-type"] + searcher.ordnance_survey_client.built_form = property_type_lookup[property_meta["T1_AssetType"]]["built-form"] + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + nodata.append(property_meta) + continue + + if searcher.newest_epc.get("estimated"): + # We insert the row ID as our proxy for UPRN + proxy_uprn = int(property_meta["row_id"].split("_")[1]) + searcher.newest_epc["uprn"] = proxy_uprn + + newest_epc = searcher.newest_epc + older_epcs = searcher.older_epcs + full_sap_epc = searcher.full_sap_epc + # We also want to get the penultimate epc + # penultimate_epc, _ = searcher.filter_newest_epc(older_epcs) + # if not penultimate_epc: + # penultimate_epc = newest_epc + + eligibility = Eligibility(epc=newest_epc, cleaned=cleaned) + eligibility.check_gbis_warmfront() + eligibility.check_eco4_warmfront() + + # if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront): + # eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned) + # eligibility.check_gbis_warmfront() + # 
eligibility.check_eco4_warmfront() + # # If this is the case, we need to update the older epcs + # # We don't update just to make data cleaning easier + # if penultimate_epc.get("estimated") is None: + # older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]] + + # If the property is a cavity wall and it's filled, we produce an estimate for the age of the cavity + + # Loft MUST be suitable + cavity_age = None + if ( + eligibility.walls["is_cavity_wall"] and + eligibility.walls["is_filled_cavity"] and + eligibility.loft["suitability"] and + eligibility.eco4_warmfront["message"] == "Failed due to full cavity - check cavity age" + ): + # We check the age of the cavity and if it's particularly old, we flag it + cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned) + + # Full checks + eligibility.check_gbis() + eligibility.check_eco4() + + if eligibility.eco4_warmfront["eligible"]: + if eligibility.epc["uprn"] in ["", None]: + eligibility.epc["uprn"] = int(property_meta["row_id"].split("_")[1]) + + if eligibility.epc["construction-age-band"] in ["", None]: + eligibility.epc["construction-age-band"] = map_year_to_age_band(property_meta["Build Yr"]) + + # This is not the right place to do this but this is temp + if eligibility.epc["extension-count"] in ["", None]: + eligibility.epc["extension-count"] = 0 + + # Not in the right place but temp + if eligibility.epc["built-form"] in ["", None]: + if not older_epcs: + eligibility.epc["built-form"] = "Mid-Terrace" + + scoring_dictionary = prepare_model_data_row( + property_id=property_meta["row_id"], + modelling_epc=eligibility.epc, + cleaned=cleaned, + cleaning_data=cleaning_data, + created_at=created_at, + old_data=older_epcs, + full_sap_epc=full_sap_epc, + photo_supply_lookup=photo_supply_lookup, + floor_area_decile_thresholds=floor_area_decile_thresholds, + ) + scoring_data.extend(scoring_dictionary) + + results.append( + { + "row_id": property_meta["row_id"], + "uprn": 
eligibility.epc["uprn"], + "Address": property_meta["T1_Address"], + "Postcode": property_meta["postcode"], + "property_type": eligibility.epc["property-type"], + "gbis_eligible": eligibility.gbis_warmfront, + "eco4_eligible": eligibility.eco4_warmfront["eligible"], + "eco4_message": eligibility.eco4_warmfront["message"], + "sap": float(eligibility.epc["current-energy-efficiency"]), + "gbis_eligible_future": eligibility.gbis["eligible"], + "gbis_eligible_future_message": eligibility.gbis["message"], + "eco4_eligible_future": eligibility.eco4["eligible"], + "eco4_eligible_future_message": eligibility.eco4["message"], + # Property components + "roof": eligibility.roof["clean_description"], + "walls": eligibility.walls["clean_description"], + "cavity_type": eligibility.cavity["type"], + "heating": eligibility.epc["mainheat-description"], + "tenure": eligibility.tenure, + "date_epc": eligibility.epc["lodgement-date"], + "cavity_age": cavity_age, + **eligibility.walls, + **eligibility.roof, + } + ) + + scoring_df = pd.DataFrame(scoring_data) + + # Perform the same cleaning as in the model - first clean number of room variables though + scoring_df = DataProcessor.apply_averages_cleaning( + data_to_clean=scoring_df, + cleaning_data=cleaning_data, + cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], + colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], + ) + + scoring_df = DataProcessor.apply_averages_cleaning( + data_to_clean=scoring_df, + cleaning_data=cleaning_data, + cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"], + ).drop(columns=["LOCAL_AUTHORITY"]) + + scoring_df = DataProcessor.clean_missings_after_description_process( + scoring_df, + ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or ( + "insulation_thickness" in c) or ("ENERGY_EFF" in c)] + ) + + scoring_df = DataProcessor.clean_efficiency_variables(scoring_df) + scoring_df["UPRN"] = scoring_df["UPRN"].astype(int) + + 
model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at) + all_predictions = model_api.predict_all( + df=scoring_df, + bucket="retrofit-data-dev", + prediction_buckets={ + "sap_change_predictions": "retrofit-sap-predictions-dev", + "heat_demand_predictions": "retrofit-heat-predictions-dev", + "carbon_change_predictions": "retrofit-carbon-predictions-dev" + } + ) + + predictions = all_predictions["sap_change_predictions"].copy() + + results_df = pd.DataFrame(results) + + predictions = predictions.rename(columns={"property_id": "row_id"}).merge( + results_df[["row_id", "sap"]], how="left", on="row_id" + ) + predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"] + predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index() + + results_df = results_df.merge( + predictions[["sap_uplift", "row_id"]], + how="left", + on="row_id" + ) + results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"] + + eligibility_assessment = [] + for _, row in results_df[results_df["eco4_eligible"] == True].iterrows(): + # The upgrade requirements are dependent on the current SAP + + # If the property is an F or G, it only needs to upgrade to an % + if row["sap"] <= 38: + if row["post_install_sap"] >= 57: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 55: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 53: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + else: + + if row["post_install_sap"] >= 71: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 69: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 67: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + + eligibility_assessment.append( + { + "row_id": row["row_id"], + "eligibility_classification": 
def get_epc_data_for_lost_surveys(
        lost_identified_properties, cleaned, cleaning_data, created_at, photo_supply_lookup,
        floor_area_decile_thresholds
):
    """Run the EPC search, eligibility checks and SAP-uplift scoring for survey
    rows that could not be matched to the asset list.

    :param lost_identified_properties: DataFrame of unmatched survey rows. Reads the
        columns "NO", "ADDRESS 1", "ADDRESS 2", "ADDRESS 3", "POSTCODE" and
        "PROPERTY TYPE". A "row_id" column is added to this frame in place.
    :param cleaned: Cleaned EPC lookup passed to ``Eligibility`` and ``prepare_model_data_row``.
    :param cleaning_data: Dataset used by ``DataProcessor.apply_averages_cleaning``.
    :param created_at: Timestamp string attached to the model rows and the ``ModelApi`` run.
    :param photo_supply_lookup: Passed through to ``prepare_model_data_row``.
    :param floor_area_decile_thresholds: Passed through to ``prepare_model_data_row``.
    :return: Tuple ``(results_df, scoring_data, nodata)``: one results row per property
        with an EPC (including uplift and eligibility classification), the raw list of
        model input rows, and the survey rows for which no EPC was found.
    """
    # NOTE: this writes to the caller's frame, as the ids are needed for later merges.
    lost_identified_properties["row_id"] = [
        "lost_surveys_ha25_" + str(i) for i in range(0, len(lost_identified_properties))
    ]

    scoring_data = []
    results = []
    nodata = []

    # NOTE(review): every survey property type, including the flat variants, maps to
    # "House" here - confirm this is intentional.
    property_type_lookup = {
        "MID-TERRACE": {"property-type": "House", "built-form": "Mid-Terrace"},
        "N/A": {"property-type": "House", "built-form": None},
        "END-TERRACE": {"property-type": "House", "built-form": "End-Terrace"},
        "GROUND-FLOOR": {"property-type": "House", "built-form": None},
        "TOP-FLOOR": {"property-type": "House", "built-form": None},
        "SEMI-DETACHED": {"property-type": "House", "built-form": "Semi-Detached"},
        "MID-FLOOR": {"property-type": "House", "built-form": None},
        "TOP-FLOOR FLAT": {"property-type": "House", "built-form": None},
        "DETACHED": {"property-type": "House", "built-form": "Detached"},
        "MID-FLOOR FLAT": {"property-type": "House", "built-form": None},
        "SEMI- DETACHED": {"property-type": "House", "built-form": "Semi-Detached"},
        "NO EPC ON GOV": {"property-type": "House", "built-form": None},
        "Top-floor flat": {"property-type": "House", "built-form": None},
        "GROUND-FLOOR FLAT": {"property-type": "House", "built-form": None},
        "NOT ON GOV SITE": {"property-type": "House", "built-form": None}
    }

    for _, property_meta in tqdm(lost_identified_properties.iterrows(), total=len(lost_identified_properties)):

        # pd.isnull covers both None and NaN; without a postcode we cannot search
        if pd.isnull(property_meta["POSTCODE"]):
            continue

        # The row id ends with the positional index, which doubles as a proxy UPRN
        proxy_uprn = int(property_meta["row_id"].split("_")[-1])

        full_address = ", ".join(
            [str(x) for x in [
                property_meta["NO"], property_meta["ADDRESS 1"], property_meta["ADDRESS 2"], property_meta["ADDRESS 3"]
            ] if not pd.isnull(x)]
        )

        searcher = SearchEpc(
            address1=str(property_meta["NO"]),
            postcode=property_meta["POSTCODE"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key=None,
            full_address=full_address
        )

        property_type_key = property_meta["PROPERTY TYPE"]
        if not pd.isnull(property_type_key):
            property_type = property_type_lookup[property_type_key.strip()]
            searcher.ordnance_survey_client.property_type = property_type["property-type"]
            searcher.ordnance_survey_client.built_form = property_type["built-form"]
        searcher.find_property(skip_os=True)

        if searcher.newest_epc is None:
            nodata.append(property_meta)
            continue

        if searcher.newest_epc.get("estimated"):
            # We insert the row ID as our proxy for UPRN
            searcher.newest_epc["uprn"] = proxy_uprn

        newest_epc = searcher.newest_epc
        older_epcs = searcher.older_epcs
        full_sap_epc = searcher.full_sap_epc
        # We also want to get the penultimate epc
        penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
        if not penultimate_epc:
            penultimate_epc = newest_epc

        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
        eligibility.check_gbis_warmfront()
        eligibility.check_eco4_warmfront()

        # If the newest EPC is not eligible under either check, retry on the penultimate EPC
        if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront):
            eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
            eligibility.check_gbis_warmfront()
            eligibility.check_eco4_warmfront()
            # If this is the case, we need to update the older epcs
            # We don't update just to make data cleaning easier
            if penultimate_epc.get("estimated") is None:
                older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]]

        # Full checks
        eligibility.check_gbis()
        eligibility.check_eco4()

        # Only properties that are ECO4 eligible and have an age band are scored by the model
        if eligibility.eco4_warmfront["eligible"] and (
                eligibility.epc["construction-age-band"] not in ["", None]
        ):
            if eligibility.epc["uprn"] in ["", None]:
                # Row ids look like "lost_surveys_ha25_<n>", so the index is the LAST token
                eligibility.epc["uprn"] = proxy_uprn

            scoring_dictionary = prepare_model_data_row(
                property_id=property_meta["row_id"],
                modelling_epc=eligibility.epc,
                cleaned=cleaned,
                cleaning_data=cleaning_data,
                created_at=created_at,
                old_data=older_epcs,
                full_sap_epc=full_sap_epc,
                photo_supply_lookup=photo_supply_lookup,
                floor_area_decile_thresholds=floor_area_decile_thresholds,
            )
            scoring_data.extend(scoring_dictionary)

        results.append(
            {
                "row_id": property_meta["row_id"],
                "uprn": eligibility.epc["uprn"],
                "Address": property_meta["ADDRESS 1"],
                "Postcode": property_meta["POSTCODE"],
                "property_type": eligibility.epc["property-type"],
                "gbis_eligible": eligibility.gbis_warmfront,
                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
                "eco4_message": eligibility.eco4_warmfront["message"],
                "sap": float(eligibility.epc["current-energy-efficiency"]),
                "gbis_eligible_future": eligibility.gbis["eligible"],
                "gbis_eligible_future_message": eligibility.gbis["message"],
                "eco4_eligible_future": eligibility.eco4["eligible"],
                "eco4_eligible_future_message": eligibility.eco4["message"],
                # Property components
                "roof": eligibility.roof["clean_description"],
                "walls": eligibility.walls["clean_description"],
                "cavity_type": eligibility.cavity["type"],
                "heating": eligibility.epc["mainheat-description"],
                "tenure": eligibility.tenure,
                "date_epc": eligibility.epc["lodgement-date"],
                **eligibility.walls,
                **eligibility.roof,
            }
        )

    scoring_df = pd.DataFrame(scoring_data)

    # Perform the same cleaning as in the model - first clean number of room variables though
    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
        colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
    )

    scoring_df = DataProcessor.apply_averages_cleaning(
        data_to_clean=scoring_df,
        cleaning_data=cleaning_data,
        cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"],
    ).drop(columns=["LOCAL_AUTHORITY"])

    scoring_df = DataProcessor.clean_missings_after_description_process(
        scoring_df,
        ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or (
                "insulation_thickness" in c) or ("ENERGY_EFF" in c)]
    )

    scoring_df = DataProcessor.clean_efficiency_variables(scoring_df)
    scoring_df["UPRN"] = scoring_df["UPRN"].astype(int)

    # NOTE(review): the portfolio id says "ha33" although this module handles HA25 -
    # confirm this is intentional before relying on the stored predictions.
    model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at)
    all_predictions = model_api.predict_all(
        df=scoring_df,
        bucket="retrofit-data-dev",
        prediction_buckets={
            "sap_change_predictions": "retrofit-sap-predictions-dev",
            "heat_demand_predictions": "retrofit-heat-predictions-dev",
            "carbon_change_predictions": "retrofit-carbon-predictions-dev"
        }
    )

    predictions = all_predictions["sap_change_predictions"].copy()

    results_df = pd.DataFrame(results)

    # Uplift per model row is prediction minus current SAP; summed per property
    predictions = predictions.rename(columns={"property_id": "row_id"}).merge(
        results_df[["row_id", "sap"]], how="left", on="row_id"
    )
    predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"]
    predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index()

    results_df = results_df.merge(
        predictions[["sap_uplift", "row_id"]],
        how="left",
        on="row_id"
    )
    results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"]

    eligibility_assessment = []
    for _, row in results_df[results_df["eco4_eligible"] == True].iterrows():
        # The upgrade requirements are dependent on the current SAP: a current SAP of
        # 38 or below is judged against the lower set of thresholds.
        # NOTE(review): presumably the F/G -> D and D/E -> C ECO4 targets with a +/-2
        # point margin - confirm.
        if row["sap"] <= 38:
            highest, high, medium = 57, 55, 53
        else:
            highest, high, medium = 71, 69, 67

        post_install_sap = row["post_install_sap"]
        # A missing uplift (NaN) fails every comparison and lands on "unlikely"
        if post_install_sap >= highest:
            eligibility_classification = "highest confidence"
        elif post_install_sap >= high:
            eligibility_classification = "high confidence"
        elif post_install_sap >= medium:
            eligibility_classification = "medium confidence"
        else:
            eligibility_classification = "unlikely"

        eligibility_assessment.append(
            {
                "row_id": row["row_id"],
                "eligibility_classification": eligibility_classification
            }
        )

    eligibility_assessment = pd.DataFrame(eligibility_assessment)

    results_df = results_df.merge(
        eligibility_assessment, how="left", on="row_id"
    )
    return results_df, scoring_data, nodata
def app():
    """Run the HA25 eligibility pipeline end to end.

    Loads the asset list, the ECO4 prospects survey list and the unmatched
    survey rows through ``load_data``, tags every asset row with a
    ``ha25_<position>`` identifier, fetches the cleaned EPC lookup, the
    cleaning dataset and the solar lookups from the ``retrofit-data-dev``
    bucket, and hands everything to ``get_epc_data``.

    Nothing is returned; the outputs only live in local variables.
    """
    bucket = "retrofit-data-dev"

    data, eco4_prospects_survey_list, lost_identified_properties = load_data()

    # Positional identifier used downstream as the property id.
    data["row_id"] = [f"ha25_{position}" for position in range(len(data))]

    # The cleaned EPC lookup is stored as a msgpack payload.
    packed_lookup = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name=bucket,
    )
    cleaned = msgpack.unpackb(packed_lookup, raw=False)

    cleaning_data = read_dataframe_from_s3_parquet(
        bucket_name=bucket,
        file_key="sap_change_model/cleaning_dataset.parquet",
    )

    # One timestamp shared by everything produced in this run.
    created_at = datetime.now().isoformat()

    solar_lookups = SolarPhotoSupply.load(bucket=bucket)
    photo_supply_lookup, floor_area_decile_thresholds = solar_lookups

    results_df, scoring_data, nodata = get_epc_data(
        data,
        cleaned,
        cleaning_data,
        created_at,
        photo_supply_lookup,
        floor_area_decile_thresholds,
    )

    # NOTE(review): the snapshot below replaces the results computed just above and
    # raises if the pickle file is absent - looks like interactive scratch code;
    # confirm whether it should stay. Only unpickle files you produced yourself.
    import pickle
    with open("ha25_10_jan.pickle", "rb") as snapshot_file:
        snapshot = pickle.load(snapshot_file)
    results_df = snapshot["results_df"]
    scoring_data = snapshot["scoring_data"]
    nodata = snapshot["nodata"]
9af5eae2..42c8fa81 100644 --- a/etl/eligibility/ha_15_32/ha33_app.py +++ b/etl/eligibility/ha_15_32/ha33_app.py @@ -264,21 +264,21 @@ def get_ha_33data(data, cleaned, cleaning_data, created_at): def analyse_ha_33(results_df, data): - results_df_social = results_df[results_df["tenure"] == "Rented (social)"] + # results_df_social = results_df[results_df["tenure"] == "Rented (social)"] + # + # results_df_social["tenure"].value_counts() - results_df_social["tenure"].value_counts() + data[data["row_id"].isin(results_df["row_id"].values)]["PROPERTY TYPE"].value_counts() - data[data["row_id"].isin(results_df_social["row_id"].values)]["PROPERTY TYPE"].value_counts() + n_identified = (results_df["gbis_eligible"] | results_df["eco4_eligible"]).sum() + n_eco4 = results_df["eco4_eligible"].sum() + n_gbis = results_df[~results_df["eco4_eligible"]]["gbis_eligible"].sum() - n_identified = (results_df_social["gbis_eligible"] | results_df_social["eco4_eligible"]).sum() - n_eco4 = results_df_social["eco4_eligible"].sum() - n_gbis = results_df_social[~results_df_social["eco4_eligible"]]["gbis_eligible"].sum() - - eco_eligibile = results_df_social[results_df_social["eco4_eligible"]] + eco_eligibile = results_df[results_df["eco4_eligible"]] eco_eligibile["walls"].value_counts() eco_eligibile["roof"].value_counts() - results_df_social[results_df_social["gbis_eligible"] | results_df_social["eco4_eligible"]]["tenure"].value_counts() + results_df[results_df["gbis_eligible"] | results_df["eco4_eligible"]]["tenure"].value_counts() results_df_social["eligibility_classification"].value_counts() @@ -316,3 +316,11 @@ def app(): created_at = datetime.now().isoformat() results_df, _, _ = get_ha_33data(data, cleaned, cleaning_data, created_at) + + # Read in + import pickle + with open("ha33_results.pickle", "rb") as f: + data = pickle.load(f) + results_df = pd.DataFrame(data["results"]) + scoring_data = data["scoring_data"] + nodata = data["nodata"] diff --git a/etl/eligibility/ha_15_32/ha4_app.py 
b/etl/eligibility/ha_15_32/ha4_app.py new file mode 100644 index 00000000..d2702dd8 --- /dev/null +++ b/etl/eligibility/ha_15_32/ha4_app.py @@ -0,0 +1,328 @@ +import os +import msgpack +from pathlib import Path +from datetime import datetime +import numpy as np +import pandas as pd +from utils.s3 import read_from_s3 +from utils.logger import setup_logger +from dotenv import load_dotenv +from utils.s3 import read_dataframe_from_s3_parquet +from tqdm import tqdm +from backend.SearchEpc import SearchEpc +from etl.eligibility.Eligibility import Eligibility +from etl.eligibility.ha_15_32.app import prepare_model_data_row +from etl.epc.DataProcessor import DataProcessor +from etl.epc.settings import COLUMNS_TO_MERGE_ON +from backend.ml_models.api import ModelApi +from etl.solar.SolarPhotoSupply import SolarPhotoSupply +from recommendations.recommendation_utils import calculate_cavity_age +from recommendation_utils import convert_thickness_to_numeric + +import re + +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env" + +logger = setup_logger() +load_dotenv(ENV_FILE) + + +def load_ha_4(): + pd.set_option('display.max_rows', 500) + pd.set_option('display.max_columns', 500) + pd.set_option('display.width', 1000) + + data = pd.read_csv(f"etl/eligibility/ha_15_32/HA 4 Asset List.csv", low_memory=False) + return data + + +def standardise_ha_4(data): + # Location name contains some strings like {0664} which we remove + data['Location Name'] = data['Location Name'].str.replace('\{.*?\}', '', regex=True) + + # Trim whitespace from either end of location name + data["Location Name"] = data["Location Name"].str.strip() + + # Remove any unusable postcodes + data = data[data["Post Code"] != '\\\\'].copy() + + # Some specific replacements + data["Location Name"] = np.where( + data["Location Name"] == "Calderbrook Pl & Cog La", + "Calderbrook Place", + data["Location Name"] + ) + + return data + + +def 
get_ha_4_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds): + scoring_data = [] + results = [] + nodata = [] + for _, property_meta in tqdm(data.iterrows(), total=len(data)): + # For many of the entries in this dataset, we're actually given an entire building, so we EPCs for every + # building + searcher = SearchEpc( + address1=property_meta["Address Line 1"], + postcode=property_meta["Post Code"], + auth_token=EPC_AUTH_TOKEN, + os_api_key=None, + property_type=property_type_lookup.get(house["Archetype"]), + ) + + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + searcher = SearchEpc( + address1=property_meta["Location Name"], + postcode=property_meta["Post Code"], + auth_token=EPC_AUTH_TOKEN, + os_api_key=None, + property_type=property_type_lookup.get(house["Archetype"]), + ) + searcher.search() + + if searcher.newest_epc is None: + nodata.append(house["row_id"]) + continue + + newest_epc = searcher.newest_epc + older_epcs = searcher.older_epcs + full_sap_epc = searcher.full_sap_epc + + searcher.search() + + if searcher.data is None: + nodata.append(property_meta.to_dict()) + continue + + epcs = searcher.data["rows"] + epcs = pd.DataFrame(epcs) + + # Take the newest EPC by UPRN + epcs = epcs.sort_values(by=["lodgement-date"], ascending=False) + newest_epcs = epcs.drop_duplicates(subset=["uprn"], keep="first") + + # For each EPC, we now check eligibility + for _, epc in newest_epcs.iterrows(): + eligibility = Eligibility(epc=epc.to_dict(), cleaned=cleaned) + eligibility.check_gbis_warmfront() + eligibility.check_eco4_warmfront() + + # If the house is not identified, we do a full gbis and eco4 check + eligibility.check_gbis() + eligibility.check_eco4() + + if eligibility.eco4_warmfront["eligible"]: + # We get old_eps + old_data = epcs[ + (epcs["uprn"] == epc["uprn"]) & + (epcs["lmk-key"] != epc["lmk-key"]) + ].to_dict("records") + + full_sap_epc = epcs[ + (epcs["uprn"] == epc["uprn"]) & + 
(epcs["transaction-type"] == "new dwelling") + ].to_dict("records") + + scoring_dictionary = prepare_model_data_row( + property_id=eligibility.epc["uprn"], + modelling_epc=eligibility.epc, + cleaned=cleaned, + cleaning_data=cleaning_data, + created_at=created_at, + old_data=old_data, + full_sap_epc=full_sap_epc + ) + scoring_data.extend(scoring_dictionary) + + results.append( + { + "uprn": epc["uprn"], + "Location Name": property_meta["Location Name"], + "Post Code": property_meta["Post Code"], + "property_type": eligibility.epc["property-type"], + "gbis_eligible": eligibility.gbis_warmfront, + "eco4_eligible": eligibility.eco4_warmfront["eligible"], + "eco4_message": eligibility.eco4_warmfront["message"], + "sap": float(eligibility.epc["current-energy-efficiency"]), + "gbis_eligible_future": eligibility.gbis["eligible"], + "gbis_eligible_future_message": eligibility.gbis["message"], + "eco4_eligible_future": eligibility.eco4["eligible"], + "eco4_eligible_future_message": eligibility.eco4["message"], + # Property components + "roof": eligibility.roof["clean_description"], + "walls": eligibility.walls["clean_description"], + "cavity_type": eligibility.cavity["type"], + "heating": eligibility.epc["mainheat-description"], + "tenure": eligibility.tenure, + "date_epc": eligibility.epc["lodgement-date"], + } + ) + + scoring_df = pd.DataFrame(scoring_data) + + # Perform the same cleaning as in the model - first clean number of room variables though + scoring_df = DataProcessor.apply_averages_cleaning( + data_to_clean=scoring_df, + cleaning_data=cleaning_data, + cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], + colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], + ) + + scoring_df = DataProcessor.apply_averages_cleaning( + data_to_clean=scoring_df, + cleaning_data=cleaning_data, + cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"], + ).drop(columns=["LOCAL_AUTHORITY"]) + + scoring_df = 
DataProcessor.clean_missings_after_description_process( + scoring_df, + ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or ( + "insulation_thickness" in c) or ("ENERGY_EFF" in c)] + ) + + scoring_df = DataProcessor.clean_efficiency_variables(scoring_df) + + model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at) + all_predictions = model_api.predict_all( + df=scoring_df, + bucket="retrofit-data-dev", + prediction_buckets={ + "sap_change_predictions": "retrofit-sap-predictions-dev", + "heat_demand_predictions": "retrofit-heat-predictions-dev", + "carbon_change_predictions": "retrofit-carbon-predictions-dev" + } + ) + + predictions = all_predictions["sap_change_predictions"].copy() + + results_df = pd.DataFrame(results) + + predictions = predictions.rename(columns={"property_id": "uprn"}).merge( + results_df[["uprn", "sap"]], how="left", on="uprn" + ) + predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"] + predictions = predictions.groupby("uprn")["sap_uplift"].sum().reset_index() + + results_df = results_df.merge( + predictions[["sap_uplift", "uprn"]], + how="left", + on="uprn" + ) + results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"] + + results_df = results_df[~pd.isnull(results_df["uprn"])] + + eligibility_assessment = [] + for _, row in results_df[results_df["eco4_eligible"] == True].iterrows(): + # The upgrade requirements are dependent on the current SAP + + # If the property is an F or G, it only needs to upgrade to an % + if row["sap"] <= 38: + if row["post_install_sap"] >= 57: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 55: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 53: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + else: + + if row["post_install_sap"] >= 71: + eligibility_classification = "highest confidence" + elif 
row["post_install_sap"] >= 69: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 67: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + + eligibility_assessment.append( + { + "uprn": row["uprn"], + "eligibility_classification": eligibility_classification + } + ) + + eligibility_assessment = pd.DataFrame(eligibility_assessment) + + results_df = results_df.merge( + eligibility_assessment, how="left", on="uprn" + ) + # We have some properties that are duplicated so we take just one instance + results_df = results_df.drop_duplicates(subset=["uprn"]) + + return results_df, scoring_data, nodata + + +def analyse_ha_4(results_df, data): + n_identified = (results_df["gbis_eligible"] | results_df["eco4_eligible"]).sum() + n_eco4 = results_df["eco4_eligible"].sum() + n_gbis = results_df[~results_df["eco4_eligible"]]["gbis_eligible"].sum() + + eco_eligibile = results_df[results_df["eco4_eligible"]] + eco_eligibile["eligibility_classification"].value_counts() + + future_possibilities_eco = results_df[ + (results_df["eco4_eligible_future"] == True) & (~(results_df["gbis_eligible"] | results_df["eco4_eligible"])) + ].copy() + + future_possibilities_gbis = results_df[ + (results_df["gbis_eligible_future"] == True) & (results_df["eco4_eligible_future"] == False) & ( + ~(results_df["gbis_eligible"] | results_df["eco4_eligible"])) + ].copy() + + total_future_possibilities = future_possibilities_eco.shape[0] + future_possibilities_gbis.shape[0] + + +def app(): + data = load_ha_4() + + data = standardise_ha_4(data) + + data["row_id"] = ["h4" + str(i) for i in range(0, len(data))] + + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + + cleaning_data = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", + ) + + created_at = 
datetime.now().isoformat() + + photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") + + results_df, scoring_data, nodata = get_ha_4_data( + data=data, + cleaned=cleaned, + cleaning_data=cleaning_data, + created_at=created_at, + photo_supply_lookup=photo_supply_lookup, + floor_area_decile_thresholds=floor_area_decile_thresholds + ) + + # Store the data locally as a pickle + # import pickle + # with open("ha_4.pickle", "wb") as f: + # pickle.dump( + # { + # "results_df": results_df, + # "scoring_data": scoring_data, + # "nodata": nodata + # }, f) + + # Read in + # import pickle + # with open("ha_4.pickle", "rb") as f: + # data = pickle.load(f) + # results_df = data["results_df"] + # scoring_data = data["scoring_data"] + # nodata = data["nodata"] diff --git a/etl/eligibility/ha_15_32/ha7_app.py b/etl/eligibility/ha_15_32/ha7_app.py new file mode 100644 index 00000000..c6486159 --- /dev/null +++ b/etl/eligibility/ha_15_32/ha7_app.py @@ -0,0 +1,383 @@ +import os +import msgpack +import openpyxl +from openpyxl.styles.colors import COLOR_INDEX +from pathlib import Path +from datetime import datetime +import pandas as pd +import numpy as np +from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet +from utils.logger import setup_logger +from dotenv import load_dotenv +from tqdm import tqdm +from backend.SearchEpc import SearchEpc +from etl.eligibility.Eligibility import Eligibility +from etl.eligibility.ha_15_32.app import prepare_model_data_row +from etl.epc.DataProcessor import DataProcessor +from etl.epc.settings import COLUMNS_TO_MERGE_ON +from backend.ml_models.api import ModelApi +from etl.solar.SolarPhotoSupply import SolarPhotoSupply +from recommendations.recommendation_utils import calculate_cavity_age +from recommendation_utils import convert_thickness_to_numeric + +ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env" + +logger = setup_logger() +load_dotenv(ENV_FILE) + 
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +OS_API_KEY = os.getenv("ORDNANCE_SURVEY_API_KEY") + + +def load_data(): + """ + Load the data from the excel + """ + + workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 7 ASSET LIST.xlsx') + sheet = workbook.active + + # Prepare lists to collect rows data and their colors + rows_data = [] + rows_colors = [] + for row in sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values + + row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None + row_color = COLOR_INDEX[row_color] + rows_data.append(row_data) + rows_colors.append(row_color) + + df = pd.DataFrame(rows_data, columns=[cell.value for cell in sheet[1]]) + + # Add the row colors as a new column + df['row_color'] = rows_colors + df.columns.values[8] = "is_active" + + # Remove None columns + df = df.dropna(axis=1, how='all') + # We now parse the colours + df["row_color"].unique() + df["row_colour_name"] = np.where( + df["row_color"] == "0000FFFF", "red", + np.where(df["row_color"] == "00FF00FF", "green", "yellow") + ) + df["row_code"] = np.where( + df["row_colour_name"] == "red", "invalid", + np.where(df["row_colour_name"] == "green", "potential ECO4", "needs criteria change") + ) + + return df + + +def get_ha7_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds): + property_type_lookup = { + # "Mid Terrace": "Mid-Terrace", + # "End Terrace": "End-Terrace", + # "Semi Detached": "Semi-Detached", + # "Detached": "Detached", + "House": "House", + "Flat": "Flat", + "Bungalow": "Bungalow", + "Maisonette": "Maisonette", + } + + scoring_data = [] + results = [] + nodata = [] + for _, house in tqdm(data.iterrows(), total=len(data)): + + if house["Address"]: + address = house["Address"] + else: + address = house["Address2"] + + searcher = SearchEpc( + address1=address, 
+ postcode=house["Postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key=None, + property_type=property_type_lookup.get(house["Archetype"]), + ) + + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + nodata.append(house["row_id"]) + continue + + newest_epc = searcher.newest_epc + older_epcs = searcher.older_epcs + full_sap_epc = searcher.full_sap_epc + + eligibility = Eligibility(epc=newest_epc, cleaned=cleaned) + eligibility.check_gbis_warmfront() + eligibility.check_eco4_warmfront() + + # If the property is a cavity wall and it's filled, we produce an estimate for the age of the cavity + + # Loft MUST be suitable + cavity_age = None + if ( + eligibility.walls["is_cavity_wall"] and + eligibility.walls["is_filled_cavity"] and + eligibility.loft["suitability"] and + eligibility.eco4_warmfront["message"] == "Failed due to full cavity - check cavity age" + ): + # We check the age of the cavity and if it's particularly old, we flag it + cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned) + + # If the house is not identified, we do a full gbis and eco4 check + eligibility.check_gbis() + eligibility.check_eco4() + + if eligibility.eco4_warmfront["eligible"]: + scoring_dictionary = prepare_model_data_row( + property_id=house["row_id"], + modelling_epc=eligibility.epc, + cleaned=cleaned, + cleaning_data=cleaning_data, + created_at=created_at, + old_data=older_epcs, + full_sap_epc=full_sap_epc, + photo_supply_lookup=photo_supply_lookup, + floor_area_decile_thresholds=floor_area_decile_thresholds + ) + scoring_data.extend(scoring_dictionary) + + # If nothing is eligible or gbis is eligible, then we make a record this + results.append( + { + "row_id": house["row_id"], + "address": house["Address"], + "postcode": house["Postcode"], + "gbis_eligible": eligibility.gbis_warmfront, + "eco4_eligible": eligibility.eco4_warmfront["eligible"], + "eco4_message": eligibility.eco4_warmfront["message"], + "sap": 
float(eligibility.epc["current-energy-efficiency"]), + "gbis_eligible_future": eligibility.gbis["eligible"], + "gbis_eligible_future_message": eligibility.gbis["message"], + "eco4_eligible_future": eligibility.eco4["eligible"], + "eco4_eligible_future_message": eligibility.eco4["message"], + # Property components + "roof": eligibility.roof["clean_description"], + "walls": eligibility.walls["clean_description"], + "heating": eligibility.epc["mainheat-description"], + "tenure": eligibility.tenure, + "date_epc": eligibility.epc["lodgement-date"], + **newest_epc, + "cavity_age": cavity_age, + **eligibility.walls, + **eligibility.roof, + } + ) + + scoring_df = pd.DataFrame(scoring_data) + # Implement the same process that is being used in the recommendation engine to cleaning scoring_df + + # Perform the same cleaning as in the model - first clean number of room variables though + scoring_df = DataProcessor.apply_averages_cleaning( + data_to_clean=scoring_df, + cleaning_data=cleaning_data, + cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'], + colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], + ) + + scoring_df = DataProcessor.apply_averages_cleaning( + data_to_clean=scoring_df, + cleaning_data=cleaning_data, + cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"], + ).drop(columns=["LOCAL_AUTHORITY"]) + + scoring_df = DataProcessor.clean_missings_after_description_process( + scoring_df, + ignore_cols=[c for c in scoring_df.columns if ("thermal_transmittance" in c) or ( + "insulation_thickness" in c) or ("ENERGY_EFF" in c)] + ) + + scoring_df = DataProcessor.clean_efficiency_variables(scoring_df) + + model_api = ModelApi(portfolio_id="ha33-eligibility", timestamp=created_at) + all_predictions = model_api.predict_all( + df=scoring_df, + bucket="retrofit-data-dev", + prediction_buckets={ + "sap_change_predictions": "retrofit-sap-predictions-dev", + "heat_demand_predictions": "retrofit-heat-predictions-dev", + 
"carbon_change_predictions": "retrofit-carbon-predictions-dev" + } + ) + + predictions = all_predictions["sap_change_predictions"].copy() + + results_df = pd.DataFrame(results) + + predictions = predictions.rename(columns={"property_id": "row_id"}).merge( + results_df[["row_id", "sap"]], how="left", on="row_id" + ) + predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"] + predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index() + + results_df = results_df.merge( + predictions[["sap_uplift", "row_id"]], + how="left", + on="row_id" + ) + + results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"] + + eligibility_assessment = [] + for _, row in results_df[results_df["eco4_eligible"] == True].iterrows(): + # The upgrade requirements are dependent on the current SAP + + # If the property is an F or G, it only needs to upgrade to an % + if row["sap"] <= 38: + if row["post_install_sap"] >= 57: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 55: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 53: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + else: + + if row["post_install_sap"] >= 71: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 69: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 67: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + + eligibility_assessment.append( + { + "row_id": row["row_id"], + "eligibility_classification": eligibility_classification + } + ) + + eligibility_assessment = pd.DataFrame(eligibility_assessment) + + results_df = results_df.merge( + eligibility_assessment, how="left", on="row_id" + ) + + return results_df, scoring_data, nodata + + +def analyse_ha_7(results_df, data): + analysis_data = results_df.merge( + data[["row_id", 
"row_code", "Property Type", "Construction Year Band"]], how="left", on="row_id" + ) + + analysis_data["row_code"].value_counts() + + # NEW + + analysis_data["roof_insulation_thickness"] = np.where( + pd.isnull(analysis_data["roof_insulation_thickness"]), None, analysis_data["roof_insulation_thickness"] + ) + analysis_data["roof_insulation_thickness_numeric"] = analysis_data["roof_insulation_thickness"].apply( + lambda x: convert_thickness_to_numeric(x, is_flat=False, is_pitched=True) + ) + + ideal_eco4 = analysis_data[ + (analysis_data["eco4_eligible"] == True) & ( + analysis_data["roof_insulation_thickness_numeric"] <= 100) + ] + + secondary_eco4_warmfront_not_sold = analysis_data[ + (analysis_data["eco4_eligible"] == True) & ( + analysis_data["roof_insulation_thickness_numeric"] > 100) + ] + + # underperforming cavities + underperforming_cavities = analysis_data[ + (analysis_data["eco4_message"] == "Failed due to full cavity - check cavity age") & ( + analysis_data["cavity_age"] > 9 * 365 + ) & (analysis_data["roof_insulation_thickness_numeric"] <= 100) + ] + + identified_gbis_not_sold = analysis_data[ + (analysis_data["gbis_eligible"] == True) & ( + analysis_data["eco4_eligible"] == False + ) + ] + + wf_identified = analysis_data[ + (analysis_data["row_code"] == "potential ECO4") + ] + + # END NEW + + warmfront_identification = analysis_data["row_code"].value_counts() + warmfront_identified = analysis_data[analysis_data["row_code"] == "potential ECO4"] + warmfront_identified["walls"].value_counts(normalize=True) + + analysis_data["Construction Year Band"].value_counts(normalize=True) + + # Number of days from today + + days_to_today = (datetime.now() - pd.to_datetime(warmfront_identified["date_epc"])).dt.days + days_to_today.mean() + + property_types = analysis_data["Property Type"].value_counts() + + n_identified = (results_df["gbis_eligible"] | results_df["eco4_eligible"]).sum() + + eco_identified = results_df[results_df["eco4_eligible"]] + n_eco4 = 
eco_identified["eco4_eligible"].sum() + gbis_identified = results_df[~results_df["eco4_eligible"] & results_df["gbis_eligible"]] + n_gbis = results_df[~results_df["eco4_eligible"]]["gbis_eligible"].sum() + + eco_eligibile = results_df[results_df["eco4_eligible"]] + eco_eligibile["eligibility_classification"].value_counts() + + future_possibilities_eco = results_df[ + (results_df["eco4_eligible_future"] == True) & (~(results_df["gbis_eligible"] | results_df["eco4_eligible"])) + ].copy() + + future_possibilities_gbis = results_df[ + (results_df["gbis_eligible_future"] == True) & (results_df["eco4_eligible_future"] == False) & ( + ~(results_df["gbis_eligible"] | results_df["eco4_eligible"])) + ].copy() + + total_future_possibilities = future_possibilities_eco.shape[0] + future_possibilities_gbis.shape[0] + + +def app(): + data = load_data() + data["row_id"] = ["ha7" + str(i) for i in range(0, len(data))] + + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + + cleaning_data = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", + ) + + photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") + + created_at = datetime.now().isoformat() + + results_df, scoring_data, nodata = get_ha7_data( + data, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds + ) + + # Pickle results + # import pickle + # with open("ha7_results_jan_10.pkl", "wb") as f: + # pickle.dump({"results_df": results_df, "scoring_data": scoring_data, "nodata": nodata}, f) + + # Read in the old data + # import pickle + # with open("ha7_results_jan_10.pkl", "rb") as f: + # old_data = pickle.load(f) + # results_df = old_data["results_df"] + # scoring_data = old_data["scoring_data"] + # nodata = old_data["nodata"] diff --git a/etl/epc/DataProcessor.py 
b/etl/epc/DataProcessor.py index c9f937c0..801a9456 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -766,12 +766,16 @@ class EPCDataProcessor: how='left' ) + global_averages = cleaning_data[cols_to_clean].mean() + # Fill NaN values with averages for col in cols_to_clean: data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"], inplace=True) data_to_clean.drop(columns=[f"{col}_AVERAGE"], inplace=True) # If we still have missings data_to_clean[col].fillna(data_to_clean[col].mean(), inplace=True) + # Final step if we still have missings - use global mean + data_to_clean[col].fillna(global_averages[col], inplace=True) return data_to_clean diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py index b4befcd7..3dc6e39b 100644 --- a/etl/epc/property_change_app.py +++ b/etl/epc/property_change_app.py @@ -23,6 +23,12 @@ def main(): pd.DataFrame(epc_pipeline.compiled_all_equal_rows).to_parquet("refactor_datasets/all_equal_rows.parquet") pd.concat(epc_pipeline.compiled_cleaning_averages).to_parquet("refactor_datasets/cleaning_averages.parquet") + from utils.s3 import read_dataframe_from_s3_parquet + dataset = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", + file_key="sap_change_model/dataset_test.parquet", + ) + if __name__ == "__main__": main() diff --git a/etl/epc_clean/epc_attributes/MainheatAttributes.py b/etl/epc_clean/epc_attributes/MainheatAttributes.py index e21f0d37..813e15a6 100644 --- a/etl/epc_clean/epc_attributes/MainheatAttributes.py +++ b/etl/epc_clean/epc_attributes/MainheatAttributes.py @@ -16,6 +16,7 @@ class MainHeatAttributes(Definitions): "solar assisted heat pump", "exhaust source heat pump", "community heat pump", + "portable electric heating" ] FUEL_TYPES = ["electric", "mains gas", "wood logs", "coal", "oil", "wood pellets", "anthracite", "dual fuel mineral and wood", "smokeless fuel", "lpg", "b30k"] diff --git a/etl/epc_clean/epc_attributes/WallAttributes.py 
b/etl/epc_clean/epc_attributes/WallAttributes.py index bfe600d5..09eac215 100644 --- a/etl/epc_clean/epc_attributes/WallAttributes.py +++ b/etl/epc_clean/epc_attributes/WallAttributes.py @@ -152,4 +152,7 @@ class WallAttributes(Definitions): else: result["insulation_thickness"] = "average" + if result["is_cavity_wall"] & result["is_as_built"] & (result["insulation_thickness"] == "average"): + result["is_filled_cavity"] = True + return result diff --git a/etl/epc_clean/epc_attributes/WindowAttributes.py b/etl/epc_clean/epc_attributes/WindowAttributes.py index e962cd31..ce0b156a 100644 --- a/etl/epc_clean/epc_attributes/WindowAttributes.py +++ b/etl/epc_clean/epc_attributes/WindowAttributes.py @@ -52,7 +52,7 @@ class WindowAttributes(Definitions): raise ValueError('Invalid description') def process(self) -> Dict[str, Union[str, bool]]: - result: Dict[str, Union[str, bool]] = { + result: Dict[str, Union[str, bool, None]] = { "has_glazing": False, "glazing_coverage": None, "glazing_type": None, @@ -80,7 +80,11 @@ class WindowAttributes(Definitions): break # If we didn't find any coverage or type, we assume full coverage - if not result["glazing_coverage"]: + if (not result["glazing_coverage"]) & (result["glazing_type"] != "single"): result["glazing_coverage"] = "full" + # We reset some values if the glazing is single + if result["glazing_type"] == "single": + result["has_glazing"] = False + return result diff --git a/etl/epc_clean/tests/test_data/test_mainheat_attributes_cases.py b/etl/epc_clean/tests/test_data/test_mainheat_attributes_cases.py index d264ebff..558b176e 100644 --- a/etl/epc_clean/tests/test_data/test_mainheat_attributes_cases.py +++ b/etl/epc_clean/tests/test_data/test_mainheat_attributes_cases.py @@ -1652,4 +1652,17 @@ mainheat_cases = [ 'has_electricaire': False, 'has_assumed_for_most_rooms': False, 'has_underfloor_heating': False, "has_electric_heat_pumps": False, "has_micro-cogeneration": False}, + {'original_description': 'Portable electric heating 
assumed for most rooms', 'has_radiators': False, + 'has_fan_coil_units': False, 'has_pipes_in_screed_above_insulation': False, + 'has_pipes_in_insulated_timber_floor': False, 'has_pipes_in_concrete_slab': False, 'has_boiler': False, + 'has_air_source_heat_pump': False, 'has_room_heaters': False, 'has_electric_storage_heaters': False, + 'has_warm_air': False, 'has_electric_underfloor_heating': False, 'has_electric_ceiling_heating': False, + 'has_community_scheme': False, 'has_ground_source_heat_pump': False, 'has_no_system_present': False, + 'has_portable_electric_heaters': False, 'has_water_source_heat_pump': False, 'has_electric_heat_pump': False, + 'has_micro-cogeneration': False, 'has_solar_assisted_heat_pump': False, 'has_exhaust_source_heat_pump': False, + 'has_community_heat_pump': False, 'has_portable_electric_heating': True, 'has_electric': True, + 'has_mains_gas': False, 'has_wood_logs': False, 'has_coal': False, 'has_oil': False, 'has_wood_pellets': False, + 'has_anthracite': False, 'has_dual_fuel_mineral_and_wood': False, 'has_smokeless_fuel': False, 'has_lpg': False, + 'has_b30k': False, 'has_assumed': True, 'has_electricaire': False, 'has_assumed_for_most_rooms': True, + 'has_underfloor_heating': False} ] diff --git a/etl/epc_clean/tests/test_data/test_wall_attributes_cases.py b/etl/epc_clean/tests/test_data/test_wall_attributes_cases.py index 300702a7..96c545c1 100644 --- a/etl/epc_clean/tests/test_data/test_wall_attributes_cases.py +++ b/etl/epc_clean/tests/test_data/test_wall_attributes_cases.py @@ -550,7 +550,7 @@ wall_cases = [ 'is_as_built': False, 'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': None, 'external_insulation': False, 'internal_insulation': False}, {'original_description': 'Cavity wall, as built, insulated (assumed)', 'thermal_transmittance': None, - 'thermal_transmittance_unit': None, 'is_cavity_wall': True, 'is_filled_cavity': False, 'is_solid_brick': False, + 
'thermal_transmittance_unit': None, 'is_cavity_wall': True, 'is_filled_cavity': True, 'is_solid_brick': False, 'is_system_built': False, 'is_timber_frame': False, 'is_granite_or_whinstone': False, 'is_as_built': True, 'is_cob': False, 'is_assumed': True, 'is_sandstone_or_limestone': False, 'insulation_thickness': 'average', 'external_insulation': False, 'internal_insulation': False}, @@ -727,7 +727,7 @@ wall_cases = [ 'external_insulation': False, 'internal_insulation': False}, {'original_description': 'Waliau ceudod, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)', 'thermal_transmittance': None, - 'thermal_transmittance_unit': None, 'is_cavity_wall': True, 'is_filled_cavity': False, 'is_solid_brick': False, + 'thermal_transmittance_unit': None, 'is_cavity_wall': True, 'is_filled_cavity': True, 'is_solid_brick': False, 'is_system_built': False, 'is_timber_frame': False, 'is_granite_or_whinstone': False, 'is_as_built': True, 'is_cob': False, 'is_assumed': True, 'is_sandstone_or_limestone': False, 'insulation_thickness': 'average', 'external_insulation': False, 'internal_insulation': False}, diff --git a/etl/epc_clean/tests/test_data/test_window_attributes_cases.py b/etl/epc_clean/tests/test_data/test_window_attributes_cases.py index 1eeeee21..f01ccba9 100644 --- a/etl/epc_clean/tests/test_data/test_window_attributes_cases.py +++ b/etl/epc_clean/tests/test_data/test_window_attributes_cases.py @@ -30,7 +30,8 @@ windows_cases = [ 'glazing_type': 'triple', 'no_data': False}, {'original_description': 'Gwydrau triphlyg rhannol', 'has_glazing': True, 'glazing_coverage': 'partial', 'glazing_type': 'triple', 'no_data': False}, - {'original_description': 'Single glazed', 'has_glazing': True, 'glazing_coverage': 'full', 'glazing_type': 'single', + {'original_description': 'Single glazed', 'has_glazing': False, 'glazing_coverage': None, + 'glazing_type': 'single', 'no_data': False}, {'original_description': 'Some double glazing', 'has_glazing': True, 
'glazing_coverage': 'partial', 'glazing_type': 'double', 'no_data': False}, @@ -46,7 +47,8 @@ windows_cases = [ 'glazing_type': 'double', 'no_data': False}, {'original_description': 'Gwydrau dwbl gan mwyaf', 'has_glazing': True, 'glazing_coverage': 'most', 'glazing_type': 'double', 'no_data': False}, - {'original_description': 'Gwydrau sengl', 'has_glazing': True, 'glazing_coverage': 'full', 'glazing_type': 'single', + {'original_description': 'Gwydrau sengl', 'has_glazing': False, 'glazing_coverage': None, + 'glazing_type': 'single', 'no_data': False}, {'original_description': 'Ffenestri perfformiad uchel', 'has_glazing': True, 'glazing_coverage': 'full', 'glazing_type': 'high performance', 'no_data': False}, diff --git a/etl/epc_clean/tests/test_roof_attributes.py b/etl/epc_clean/tests/test_roof_attributes.py index b0663a3e..481beedc 100644 --- a/etl/epc_clean/tests/test_roof_attributes.py +++ b/etl/epc_clean/tests/test_roof_attributes.py @@ -3,12 +3,13 @@ from pathlib import Path from etl.epc_clean.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + # For local testing -if __file__ == "": - input_data_path = Path("./model_data/tests/test_data/EpcClean_inputs.obj") -else: - current_file_path = Path(__file__) - input_data_path = current_file_path.parent / 'test_data' / 'EpcClean_inputs.obj' +# if __file__ == "": +# input_data_path = Path("./model_data/tests/test_data/EpcClean_inputs.obj") +# else: +# current_file_path = Path(__file__) +# input_data_path = current_file_path.parent / 'test_data' / 'EpcClean_inputs.obj' class TestRoofAttributes: @@ -88,7 +89,12 @@ class TestRoofAttributes: def test_clean_roof_no_description(self): roof = RoofAttributes('').process() - assert roof == {} + assert roof == { + 'thermal_transmittance': False, 'thermal_transmittance_unit': False, 'is_pitched': False, + 'is_roof_room': False, 'is_loft': False, 'is_flat': False, 'is_thatched': False, 
class SolarPhotoSupply:
    """
    Builds, stores and queries a lookup of observed PHOTO_SUPPLY values from EPC certificate data.

    The lookup is aggregated by property type, built form, tenure, roof shape, construction age band and
    floor area decile, and holds the median and mean PHOTO_SUPPLY of certificates that report a non-zero value.
    """

    DATASET_COLUMNS = [
        "UPRN", "PROPERTY_TYPE", "TENURE", "BUILT_FORM", "ROOF_DESCRIPTION", "PHOTO_SUPPLY", "TOTAL_FLOOR_AREA",
        "CONSTRUCTION_AGE_BAND", "SOLAR_WATER_HEATING_FLAG"
    ]

    # A certificate row is only usable when all of these columns are populated
    REQUIRED_COLUMNS = ["UPRN", "PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "TOTAL_FLOOR_AREA"]

    # The full "not defined" tenure description is remapped to the shortened form that is matched against the
    # lookup's tenure column
    TENURE_REMAP = {
        "not defined - use in the case of a new dwelling for which the intended tenure in not known. it is not to "
        "be used for an existing dwelling":
            "not defined - use in the case of a new dwelling for which the intended tenure in not known. it is no"
    }

    def __init__(self, file_directories, cleaned_lookup):
        """
        Initialize the SolarPhotoSupply class with file directories and a cleaned lookup. Currently, this class
        just works with locally stored data, but this could be extended to work with data stored in S3.

        :param file_directories: An iterable of directories (path-like objects supporting "/"), each holding a
            certificates.csv file.
        :param cleaned_lookup: A dictionary containing cleaned lookup data. Only the "roof-description" entry is used.
        """
        self.file_directories = file_directories

        self.results = []
        self.decile_thresholds = None

        self.roof_lookup = pd.DataFrame(cleaned_lookup.get("roof-description"))

        self.photo_supply_lookup = pd.DataFrame()
        self.floor_area_decile_thresholds = pd.DataFrame()

    def create_dataset(self):
        """
        Create a dataset from the provided file directories. This method processes the data files,
        applies transformations, and aggregates data into a useful format.

        Populates self.results, self.decile_thresholds, self.photo_supply_lookup and
        self.floor_area_decile_thresholds.

        :raises ValueError: If there is no roof lookup data, or no certificate data could be read.
        """

        if self.roof_lookup.empty:
            raise ValueError("No roof lookup data")

        frames = []

        logger.info("Creating solar photo supply dataset")
        for directory in tqdm(self.file_directories):
            filepath = directory / "certificates.csv"
            df = pd.read_csv(filepath, low_memory=False)
            # Drop rows that have a missing UPRN, PROPERTY_TYPE, BUILT_FORM, CONSTRUCTION_AGE_BAND or
            # TOTAL_FLOOR_AREA. We take a copy so that the UPRN assignment below writes to an independent frame
            df = df.dropna(subset=self.REQUIRED_COLUMNS).copy()
            df["UPRN"] = df["UPRN"].astype(int).astype(str)
            # Take newest LODGEMENT_DATE per UPRN
            df = df.sort_values(by="LODGEMENT_DATE", ascending=False).drop_duplicates(subset=["UPRN"])

            data = df[self.DATASET_COLUMNS].copy()
            data["PHOTO_SUPPLY"] = data["PHOTO_SUPPLY"].fillna(0)
            # Only properties that report some photovoltaic supply are useful for the lookup
            data = data[data["PHOTO_SUPPLY"] != 0]
            frames.append(data)

        if not frames:
            # Fail with a meaningful message rather than the generic pandas concat error
            raise ValueError("No certificate data found in the provided file directories")

        self.results = pd.concat(frames)

        # Convert total floor area to deciles
        self.decile_thresholds = self.results["TOTAL_FLOOR_AREA"].quantile(
            [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        ).values

        # Right-inclusive bins, which matches the "<=" comparison used in classify_floor_area
        self.results["floor_area_decile"] = pd.cut(
            self.results["TOTAL_FLOOR_AREA"],
            bins=[0] + list(self.decile_thresholds) + [float('inf')],
            labels=False,
            include_lowest=True
        )

        # Convert tenure to lower
        self.results["TENURE"] = self.results["TENURE"].str.lower()

        self.results = self.results.merge(
            self.roof_lookup.drop(
                columns=[
                    "clean_description", "thermal_transmittance", "thermal_transmittance_unit", "insulation_thickness",
                    "is_assumed"
                ]
            ),
            left_on="ROOF_DESCRIPTION",
            right_on="original_description",
            how="left"
        )

        self.photo_supply_lookup = self.results.groupby(
            [
                "PROPERTY_TYPE", "BUILT_FORM", "TENURE", "is_pitched", "is_roof_room", "is_flat",
                "CONSTRUCTION_AGE_BAND", "floor_area_decile"
            ],
            observed=True
        ).agg(
            {
                "PHOTO_SUPPLY": ["median", "mean"],
            }
        ).reset_index()

        # Flatten the two-level column index produced by agg, e.g. ("PHOTO_SUPPLY", "median") -> "PHOTO_SUPPLY_median"
        flattened = ['_'.join(col).strip() for col in self.photo_supply_lookup.columns.values]
        # Remove trailing underscore from columns (group keys have an empty second level)
        flattened = [col[:-1] if col.endswith("_") else col for col in flattened]
        # Convert columns to lowercase
        self.photo_supply_lookup.columns = [col.lower() for col in flattened]

        self.floor_area_decile_thresholds = pd.DataFrame(
            self.decile_thresholds,
            columns=["floor_area_decile_thresholds"]
        )

    @staticmethod
    def classify_floor_area(new_area, thresholds):
        """
        Classify a given floor area into a decile based on provided thresholds.

        :param new_area: The new floor area to be classified.
        :param thresholds: An ascending sequence of thresholds used for classification.
        :return: The index of the first threshold that new_area does not exceed, or len(thresholds) if it exceeds
            them all.
        """

        for i, threshold in enumerate(thresholds):
            if new_area <= threshold:
                return i  # Returns the decile index (0 to 9)
        return len(thresholds)

    def save(self, bucket_name="retrofit-data-dev"):
        """
        Save the processed data to an S3 bucket in the parquet format. This method also handles
        logging and validation to ensure data is present before saving.

        :param bucket_name: The name of the S3 bucket to write to. Defaults to the development bucket.
        :raises ValueError: If create_dataset has not produced a lookup yet.
        """
        if self.photo_supply_lookup.empty:
            raise ValueError("No data to save")

        logger.info("Storing outputs to S3")
        # Store this data in s3 as a parquet file

        save_dataframe_to_s3_parquet(
            df=self.photo_supply_lookup,
            bucket_name=bucket_name,
            file_key="solar_pv_supply/photo_supply_lookup.parquet",
        )

        save_dataframe_to_s3_parquet(
            df=self.floor_area_decile_thresholds,
            bucket_name=bucket_name,
            file_key="solar_pv_supply/floor_area_decile_thresholds.parquet",
        )

    @staticmethod
    def load(bucket):
        """
        Load datasets from an S3 bucket.

        :param bucket: The name of the S3 bucket to load data from.
        :return: A tuple containing photo supply lookup and floor area decile thresholds dataframes.
        """
        photo_supply_lookup = read_dataframe_from_s3_parquet(
            bucket_name=bucket, file_key="solar_pv_supply/photo_supply_lookup.parquet",
        )
        floor_area_decile_thresholds = read_dataframe_from_s3_parquet(
            bucket_name=bucket, file_key="solar_pv_supply/floor_area_decile_thresholds.parquet",
        )

        return photo_supply_lookup, floor_area_decile_thresholds

    @classmethod
    def filter_photo_supply_lookup(
            cls,
            photo_supply_lookup: pd.DataFrame,
            floor_area_decile_thresholds: pd.DataFrame,
            tenure: str,
            built_form: str,
            property_type: str,
            construction_age_band: str,
            is_flat: bool,
            is_pitched: bool,
            is_roof_room: bool,
            floor_area: float
    ):

        """
        Filter the photo supply lookup to find the most appropriate photo supply for a given property.
        :param photo_supply_lookup: The photo supply lookup dataframe.
        :param floor_area_decile_thresholds: The floor area decile thresholds dataframe.
        :param tenure: The tenure of the property.
        :param built_form: The built form of the property.
        :param property_type: The property type of the property.
        :param construction_age_band: The construction age band of the property.
        :param is_flat: Whether the property has a flat roof.
        :param is_pitched: Whether the property has a pitched roof.
        :param is_roof_room: Whether the property has a roof room.
        :param floor_area: The floor area of the property.
        :return: The matching rows of photo_supply_lookup. More than one row is returned when the property's
            floor area decile is not present amongst the matches.
        :raises ValueError: If nothing matches on tenure, built form and property type.
        :raises AttributeError: If tenure is not a string.
        """

        # Convert the tenure to lower case, as is done in the creation of the dataset
        tenure = tenure.lower()
        # We remap the "not defined" tenure to its shortened form
        tenure = cls.TENURE_REMAP.get(tenure, tenure)

        # These three attributes must match in both the exact and the fallback lookup
        core_match = (
            (photo_supply_lookup["tenure"] == tenure) &
            (photo_supply_lookup["built_form"] == built_form) &
            (photo_supply_lookup["property_type"] == property_type)
        )

        photo_supply_matched = photo_supply_lookup[
            core_match &
            (photo_supply_lookup["construction_age_band"] == construction_age_band) &
            (photo_supply_lookup["is_flat"] == is_flat) &
            (photo_supply_lookup["is_pitched"] == is_pitched) &
            (photo_supply_lookup["is_roof_room"] == is_roof_room)
        ]

        if photo_supply_matched.empty:
            # There are a small number of cases where we don't get a full match so try again with a more aggregated
            # average
            photo_supply_matched = photo_supply_lookup[core_match]
            if construction_age_band in photo_supply_matched["construction_age_band"].values:
                photo_supply_matched = photo_supply_matched[
                    photo_supply_matched["construction_age_band"] == construction_age_band
                ]

        if photo_supply_matched.empty:
            raise ValueError("No photo supply matches")

        floor_area_decile = cls.classify_floor_area(
            floor_area, floor_area_decile_thresholds["floor_area_decile_thresholds"].values
        )

        # Only narrow by decile when that decile is present, otherwise keep the broader match
        if floor_area_decile in photo_supply_matched["floor_area_decile"].values:
            photo_supply_matched = photo_supply_matched[
                photo_supply_matched["floor_area_decile"] == floor_area_decile
            ]

        return photo_supply_matched
class TestSolarPhotoSupply(unittest.TestCase):
    """Tests for the lookup filtering and floor area classification of SolarPhotoSupply."""

    def setUp(self):
        # Mock data for photo_supply_lookup and floor_area_decile_thresholds: two rows that differ on every
        # attribute, so each test can target exactly one of them
        lookup_columns = {
            "tenure": ["leasehold", "freehold"],
            "built_form": ["detached", "semi-detached"],
            "property_type": ["house", "flat"],
            "construction_age_band": ["pre-1900", "1900-1929"],
            "is_flat": [False, True],
            "is_pitched": [True, False],
            "is_roof_room": [False, True],
            "floor_area_decile": [0, 1],
            "photo_supply": [100, 200]
        }
        self.photo_supply_lookup = pd.DataFrame(lookup_columns)
        self.floor_area_decile_thresholds = pd.DataFrame({"floor_area_decile_thresholds": [50, 100]})
        self.solar_photo_supply = SolarPhotoSupply([], {})

    def _filter(self, *property_attributes):
        """
        Run filter_photo_supply_lookup against the mock lookup.

        :param property_attributes: tenure, built_form, property_type, construction_age_band, is_flat, is_pitched,
            is_roof_room and floor_area, in that order.
        """
        return self.solar_photo_supply.filter_photo_supply_lookup(
            self.photo_supply_lookup,
            self.floor_area_decile_thresholds,
            *property_attributes
        )

    def test_correct_filtering(self):
        matched = self._filter("leasehold", "detached", "house", "pre-1900", False, True, False, 45)
        self.assertEqual(len(matched), 1)
        self.assertEqual(matched.iloc[0]["photo_supply"], 100)

    def test_no_matches(self):
        # An unknown built form cannot be matched, even by the more aggregated fallback
        with self.assertRaises(ValueError):
            self._filter("leasehold", "unknown", "house", "pre-1900", False, True, False, 45)

    def test_floor_area_decile_matching(self):
        matched = self._filter("freehold", "semi-detached", "flat", "1900-1929", True, False, True, 60)
        self.assertEqual(len(matched), 1)
        self.assertEqual(matched.iloc[0]["photo_supply"], 200)

    def test_invalid_parameters(self):
        # Invalid type for tenure: an int has no lower() method
        with self.assertRaises(AttributeError):
            self._filter(123, "detached", "house", "pre-1900", False, True, False, 45)

    def test_classify_floor_area(self):
        thresholds = [10, 20, 30, 40, 50]
        classifier = SolarPhotoSupply([], {})

        cases = [
            # Valid floor area
            (25, 2, "Decile classification did not match expected result"),
            # Out of range floor area
            (60, len(thresholds), "Decile classification for out of range value is incorrect"),
        ]
        for floor_area, expected_decile, message in cases:
            self.assertEqual(classifier.classify_floor_area(floor_area, thresholds), expected_decile, message)
def check_numeric_performance(estimated_value, actual_value):
    """
    Score how far an estimated numeric value is from the actual value, as a relative error.

    :param estimated_value: The value produced by the estimation process. May be missing.
    :param actual_value: The value recorded on the real EPC. May be missing.
    :return: None if there is no actual value to compare against; 1 if the estimate is missing; 0 or 1 when the
        actual value is zero (estimate also zero / estimate non-zero); otherwise the absolute error relative to the
        magnitude of the actual value. The result is never negative.
    """
    # If we don't have anything to compare against, return None
    if pd.isnull(actual_value):
        return None

    # A missing estimate is treated as being completely wrong
    if pd.isnull(estimated_value):
        return 1

    # A relative error is undefined for a zero actual value, so we score it as all or nothing
    if actual_value == 0:
        return 0 if estimated_value == 0 else 1

    # Divide by the magnitude of the actual value so that a negative actual value cannot produce a negative error
    return abs(estimated_value - actual_value) / abs(actual_value)
+ """ + + numerical_vartypes = {key: value for key, value in vartypes.items() if value in ["float", "Int64"]} + str_var_types = {key: value for key, value in vartypes.items() if value == "str"} + # Make sure we have missed any keys + if len(numerical_vartypes) + len(str_var_types) != len(vartypes): + raise ValueError("Not all vartypes have been accounted for") + + # Drop some keys that aren't important + for k in CATETORICALS_TO_IGNORE: + str_var_types.pop(k, None) + + directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] + + directory_sample = choices(directories, k=N_DIRECTORIES) + + results = [] + + for directory in tqdm(directory_sample): + filepath = directory / "certificates.csv" + df = pd.read_csv(filepath, low_memory=False) + df["UPRN"] = df["UPRN"].astype("Int64").astype("str") + df = df[~pd.isnull(df["UPRN"])] + + # uprn_sample = sample(df["UPRN"].unique().tolist(), DIR_SAMPLE_SIZE) + # Take a fixed sample based on the first DIR_SAMPLE_SIZE uprns + uprn_sample = sorted(df["UPRN"].unique().tolist())[:DIR_SAMPLE_SIZE] + df_sample = df[df["UPRN"].isin(uprn_sample)] + # Take the record with the newest LODGEMENT_DATETIME by uprn + df_sample = df_sample.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN") + # Convert the columns to lower case and replace underscores with hyphens, the same as the api + df_sample.columns = df_sample.columns.str.lower().str.replace("_", "-") + + # For each epc, we test the estimation process + for _, epc in df_sample.iterrows(): + epc = epc.to_dict() + address1 = epc["address1"] + postcode = epc["postcode"] + + # Get all EPCs for this urpn and we make sure they get dropped from the estimate_epc function + epcs_for_uprn = df[df["UPRN"] == epc["uprn"]] + lmks_to_drop = epcs_for_uprn["LMK_KEY"].tolist() + searcher = SearchEpc(address1, postcode, auth_token=EPC_AUTH_TOKEN, os_api_key="") + searcher.uprn = epc["uprn"] + + # Perform the same remapping for built-form as in the Property class 
for this test, in case we get (e.g.) + # Enclosed End-Terrace + built_form = BUILT_FORM_REMAP.get(epc["built-form"], epc["built-form"]) + if ((epc["property-type"] == "Maisonette") & (built_form == "Detached")) or ( + built_form in Definitions.DATA_ANOMALY_MATCHES + ): + built_form = "" + + estimated_epc = searcher.estimate_epc( + property_type=epc["property-type"], built_form=built_form, lmks_to_drop=lmks_to_drop + ) + + # We now compare the difference between the estimated and original + # TODO: We can convert windows and lighting to numeric versions and estimate how close we are + numeric_performance = { + key: check_numeric_performance(estimated_epc[key], epc[key]) for key, value in + numerical_vartypes.items() + } + + # Remove Nones + numeric_performance = {key: value for key, value in numeric_performance.items() if value is not None} + # Get an average + numeric_performance = sum(numeric_performance.values()) / len(numeric_performance) + numeric_success = 1 - numeric_performance + + # categorical performance + categorical_performance = { + key: 0 if estimated_epc[key] != epc[key] else 1 for key, value in str_var_types.items() + } + # Get an average + categorical_success = sum(categorical_performance.values()) / len(categorical_performance) + + results.append( + { + "uprn": epc["uprn"], + "numeric_success": numeric_success, + "categorical_success": categorical_success, + "property_type": epc["property-type"], + "built_form": epc["built-form"], + "tenure": epc["tenure"], + } + ) + + # Get aggregate performance figures + results_df = pd.DataFrame(results) + results_df["tenure"] = results_df["tenure"].replace("Rented (social)", "rental (social)") + + avg_numeric_succes = results_df["numeric_success"].median() + avg_categorical_sucess = results_df["categorical_success"].median() + + # With 20 nearest homes + # 0.7718100840549558 + # 0.5116279069767442 + # 100 nearest homes + # 0.7859617377809409 + # 0.5348837209302325 + + # Fixed sample, sqrt weights + + # Group 
def app():
    """
    Upload the "no_epc" test portfolio to S3 and print the request body that triggers it.

    The csv is stored under "<USER_ID>/<PORTFOLIO_ID>/no_epc.csv" in the plan inputs bucket.
    NOTE(review): going by the file name, these addresses are presumably ones without an existing EPC - confirm.
    :return:
    """

    # Key of the csv within the bucket, which is also passed on as the trigger file path
    trigger_file_path = f"{USER_ID}/{PORTFOLIO_ID}/no_epc.csv"

    portfolio_rows = [
        {"address": "21 Butler House", "postcode": "E2 0PN", "Notes": None},
        {"address": "22 Butler House", "postcode": "E2 0PN", "Notes": None},
        {"address": "23 Butler House", "postcode": "E2 0PN", "Notes": None},
        {"address": "24 Butler House", "postcode": "E2 0PN", "Notes": None},
    ]

    # Store the data in s3
    save_csv_to_s3(
        dataframe=pd.DataFrame(portfolio_rows),
        bucket_name="retrofit-plan-inputs-dev",
        file_name=trigger_file_path
    )

    # Printed so that it can be copied into a request to the recommendation engine
    print(
        {
            "portfolio_id": str(PORTFOLIO_ID),
            "housing_type": "Social",
            "goal": "Increase EPC",
            "goal_value": "A",
            "trigger_file_path": trigger_file_path
        }
    )
# This data is based on the MCS database
# Regional entries are keyed as "average_cost_per_kwh-<region>".
# NOTE(review): despite the "per_kwh" naming, Costs.solar_pv multiplies these figures by the system size in
# kilowatts (wattage / 1000), so they are used as a cost per kW of capacity - confirm the unit against the MCS data.
# NOTE(review): the un-suffixed "average_cost_per_kwh" entry looks like the national average, and "last_updated"
# the date the figures were collected - confirm.
MCS_SOLAR_PV_COST_DATA = {
    "last_updated": "2024-01-04",
    "average_cost_per_kwh": 2013.94,
    "average_cost_per_kwh-Outer London": 2618.75,
    "average_cost_per_kwh-Inner London": 2618.75,
    "average_cost_per_kwh-South East England": 2083.33,
    "average_cost_per_kwh-South West England": 2113,
    "average_cost_per_kwh-East of England": 1973.86,
    "average_cost_per_kwh-East Midlands": 1981.86,
    "average_cost_per_kwh-West Midlands": 1926.55,
    "average_cost_per_kwh-North East England": 2028.49,
    "average_cost_per_kwh-North West England": 1620.42,
    "average_cost_per_kwh-Yorkshire and the Humber": 2060.9,
    "average_cost_per_kwh-Wales": 1898.83,
    "average_cost_per_kwh-Scotland": 1967.97,
    "average_cost_per_kwh-Northern Ireland": 2126.09,
}
This is to account for any site preparation that might be required - EWI_NO_SCAFFOLDING_PRELIMINARIES = 0.15 - EWI_SCAFFOLDING_PRELIMINARIES = 0.20 + EWI_NO_SCAFFOLDING_PRELIMINARIES = 0.2 + EWI_SCAFFOLDING_PRELIMINARIES = 0.25 VAT_RATE = 0.2 PROFIT_MARGIN = 0.2 + # Based on this greenmatch article, on average, a Sash window is around 50% more expensive than a casement window. + # Therefore, for a conservative cost estimate, and allowance for a more premium window type, we inflate the material + # cost of the windows to allow for a sash window type + # https://www.greenmatch.co.uk/windows/double-glazing/cost + SASH_WINDOW_INFLATION_FACTOR = 1.5 + + # Typically, secondary glazing can be installed for 25% of the cost of double glazed windows - to be conservative, + # we scale the cost by half + SECONDARY_GLAZING_SCALING_FACTOR = 0.5 + def __init__(self, property_instance): """ Initializes the Costs class with a property instance. @@ -147,12 +176,16 @@ class Costs: """ material_cost_per_m2 = material["material_cost"] + # We inflate material costs due to recent price increases + material_cost_per_m2 = material_cost_per_m2 * 1.5 + base_material_cost = material_cost_per_m2 * floor_area labour_cost = material["labour_cost"] * floor_area * self.labour_adjustment_factor subtotal_before_profit = base_material_cost + labour_cost - contingency_cost = subtotal_before_profit * self.CONTINGENCY + # We use high risk contingency because of the possibility of access issues and clearing existing insulation + contingency_cost = subtotal_before_profit * self.HIGH_RISK_CONTINGENCY preliminaries_cost = subtotal_before_profit * self.PRELIMINARIES profit_cost = subtotal_before_profit * self.PROFIT_MARGIN @@ -719,3 +752,121 @@ class Costs: "labour_days": labour_days, "labour_cost": labour_costs } + + def window_glazing(self, number_of_windows, material, is_secondary_glazing=False): + """ + We characterise the jobs to be done for window glazing as the following: + 1) Initial Assessment and 
Measurements: Before removing the existing window, it's essential to assess the + condition of the window frame and opening. Precise measurements are taken to ensure the new double glazed + windows fit perfectly. + + 2) Remove the Existing Window: This involves carefully dismantling and removing the old single glazed window. It + requires skill to avoid damaging the surrounding wall and the window frame (if it's to be reused). + + 3) Dispose of the Existing Window: The old window, especially if it's a single glazed unit, needs to be + disposed of responsibly. Glass and other materials should be recycled where possible. + + 4) Surface Preparation: The window opening might need some preparation, especially if there's damage or if + adjustments are needed to accommodate the new window. This can include repairing or replacing parts of the + window frame, sealing gaps, and ensuring the opening is level and square. + + 5) Install the Window Frame (if new frames are used): In many cases, double glazed windows come with their + frames. These need to be installed securely into the window opening. This process involves aligning, leveling, + and fixing the frame in place. + + 6) Install the Window Sill: If a new window sill is required, it is installed at this stage. It needs to be + correctly aligned with the frame and securely attached. + + 7) Install the Double Glazed Glass Units: The glass units are carefully inserted into the frame. This step + requires precision to ensure a snug fit without causing stress on the glass, which could lead to cracking or + breaking. + + 8) Sealing and Weatherproofing: After the glass units are in place, it's crucial to seal around the frame and + between the glass and frame to ensure there are no drafts and that the installation is weather-tight. This + typically involves applying silicone sealant or other appropriate sealing materials. 
+ + 9) Finishing Touches: This includes any cosmetic work, such as trimming, painting, or staining the frame and + sill to match the rest of the property. It might also involve cleaning up any mess created during the + installation. + + 10) Inspection and Testing: Finally, the new windows should be inspected to ensure they open, close, and lock + correctly. This is also a good time to check for any gaps or issues with the sealing. + + For this cost estimation process, we factor in initial assement into the preliminaries + + """ + + material_cost = material["material_cost"] * number_of_windows + + labour_cost = ( + material["labour_cost"] * number_of_windows * self.labour_adjustment_factor + ) + multiplier = self.SECONDARY_GLAZING_SCALING_FACTOR if is_secondary_glazing else ( + self.SASH_WINDOW_INFLATION_FACTOR) + + subtotal = (material_cost + labour_cost) * multiplier + + contingency_cost = subtotal * self.CONTINGENCY + preliminaries_cost = subtotal * self.PRELIMINARIES + profit_cost = subtotal * self.PROFIT_MARGIN + + subtotal_before_vat = subtotal + contingency_cost + preliminaries_cost + profit_cost + + vat_cost = subtotal_before_vat * self.VAT_RATE + + total_cost = subtotal_before_vat + vat_cost + + labour_hours = material["labour_hours_per_unit"] * number_of_windows + labour_hours = labour_hours * self.SECONDARY_GLAZING_SCALING_FACTOR if is_secondary_glazing else labour_hours + + # Assume a team of 2 + labour_days = (labour_hours / 8) / 2 + + return { + "total": total_cost, + "subtotal": subtotal_before_vat, + "vat": vat_cost, + "contingency": contingency_cost, + "preliminaries": preliminaries_cost, + "material": material_cost, + "profit": profit_cost, + "labour_hours": labour_hours, + "labour_cost": labour_cost, + "labour_days": labour_days + } + + def solar_pv(self, wattage: float): + + """ + Calculates the total cost for solar PV based data provided by the MCS dashboard, which contains + costing data for installations of renewable and clean energy measures. 
+ + The data in the dashboard is filtered on domestic building installations and then the data across the + various regions is manually collected. There is currently no automated way to get the data from the MCS + dashboard + + Price can also be benchmarked against this checkatrade article: + https://www.checkatrade.com/blog/cost-guides/cost-of-solar-panel-installation/ + :param wattage: Peak wattage of the solar PV system + :return: + """ + + # Get the cost data relevant to the region + regional_cost = MCS_SOLAR_PV_COST_DATA["-".join(["average_cost_per_kwh", self.region])] + + kw = wattage / 1000 + total_cost = kw * regional_cost + + subtotal_before_vat = total_cost / (1 + self.VAT_RATE) + vat = total_cost - subtotal_before_vat + + # Labour hours are based on estimates from online research but an average team seems to consist of 3 people + # and most jobs take around 2 days. Assuming an 8 hour day for 3 people across 2 days, gives us 72 hours of + # labour + return { + "total": total_cost, + "subtotal": subtotal_before_vat, + "vat": vat, + "labour_hours": 72, + "labour_days": 2, + } diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 60cdb696..2b35ffea 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -6,6 +6,8 @@ from recommendations.RoofRecommendations import RoofRecommendations from recommendations.VentilationRecommendations import VentilationRecommendations from recommendations.FireplaceRecommendations import FireplaceRecommendations from recommendations.LightingRecommendations import LightingRecommendations +from recommendations.SolarPvRecommendations import SolarPvRecommendations +from recommendations.WindowsRecommendations import WindowsRecommendations from backend.ml_models.AnnualBillSavings import AnnualBillSavings @@ -35,6 +37,8 @@ class Recommendations: ) self.fireplace_recommender = FireplaceRecommendations(property_instance=property_instance) self.lighting_recommender = 
class SolarPvRecommendations:
    """
    Builds a solar PV recommendation for a property, sized from the usable roof area and priced through
    ``Costs.solar_pv``.
    """

    # Approximate area of a single solar panel.
    # NOTE(review): presumably square metres, matching property.solar_pv_roof_area - confirm
    SOLAR_PANEL_AREA = 1.6
    # Wattage per panel
    SOLAR_PANEL_WATTAGE = 360

    def __init__(self, property_instance):
        """
        :param property_instance: Instance of the Property class, for the home associated to property_id
        """

        self.property = property_instance
        self.costs = Costs(self.property)

        # Stays empty unless recommend() finds the property suitable
        self.recommendation = []

    def _has_no_existing_solar_pv(self) -> bool:
        """
        Return True when the photo-supply field indicates that no solar PV is installed.

        The field counts as "no solar PV" when it is None, 0 or a data anomaly marker.
        """
        photo_supply = self.property.data["photo-supply"]
        anomalies = self.property.DATA_ANOMALY_MATCHES

        # Original check: None, zero, or equal to the anomaly marker as a whole
        if photo_supply in [None, 0, anomalies]:
            return True

        # DATA_ANOMALY_MATCHES is defined on the property class, which is not visible from this file. It is
        # presumably a collection of placeholder strings, in which case each entry must be matched
        # individually. Strings are deliberately excluded to avoid substring matching.
        if isinstance(anomalies, (list, tuple, set, frozenset)):
            return photo_supply in anomalies

        return False

    def recommend(self):
        """
        We check if a property is potentially suitable for solar PV based on the following criteria:
        - The property is a house or bungalow
        - The property has a flat roof, a pitched roof or a roof room
        - The property does not have existing solar pv
        - The usable roof area fits at least one panel

        If all criteria hold, self.recommendation is set to a single-element list describing the system.
        Otherwise it is left untouched.
        :return:
        """

        is_valid_property_type = self.property.data["property-type"] in ["House", "Bungalow"]
        roof = self.property.roof
        is_valid_roof_type = roof["is_flat"] or roof["is_pitched"] or roof["is_roof_room"]
        has_no_existing_solar_pv = self._has_no_existing_solar_pv()

        if not (is_valid_property_type and is_valid_roof_type and has_no_existing_solar_pv):
            return

        # We now have a property which is potentially suitable for solar PV
        number_solar_panels = np.floor(self.property.solar_pv_roof_area / self.SOLAR_PANEL_AREA)

        # A roof that cannot fit a single panel would otherwise yield a "0 kilowatt-peak" system. Written
        # as "not >= 1" so that a NaN roof area is also rejected.
        if not number_solar_panels >= 1:
            return

        solar_panel_wattage = number_solar_panels * self.SOLAR_PANEL_WATTAGE

        # Given the wattage, we estimate the cost of the solar PV system. This is based on the MCS database
        # of solar PV installations
        cost_result = self.costs.solar_pv(wattage=solar_panel_wattage)

        # Rounded to whole kilowatts for the description only; costing uses the exact wattage
        kw = int(np.round(solar_panel_wattage / 1000))

        self.recommendation = [
            {
                "parts": [],
                "type": "solar_pv",
                "description": f"Install a {kw} kilowatt-peak (kWp) solar photovoltaic (PV) panel system on the roof",
                "starting_u_value": None,
                "new_u_value": None,
                "sap_points": None,
                **cost_result,
                # This is required for simulating the SAP impact. solar_pv_percentage is between 0 & 1 so we
                # scale back up here
                "photo_supply": 100 * self.property.solar_pv_percentage
            }
        ]
+ + number_of_windows = self.property.number_of_windows + is_secondary_glazing = self.property.restricted_measures or ( + self.property.windows["glazing_type"] == "secondary" + ) + + if not number_of_windows: + raise ValueError("Number of windows not specified") + + if self.property.windows["has_glazing"] & (self.property.windows["glazing_coverage"] == "full"): + return + + # We scale the number of windows based on the proportion of existing glazing + if self.property.data["multi-glaze-proportion"] != "": + n_windows_scalar = 1 - (int(self.property.data["multi-glaze-proportion"]) / 100) + else: + n_windows_scalar = self.COVERAGE_MAP.get(self.property.windows["glazing_coverage"], 1) + + number_of_windows *= n_windows_scalar + number_of_windows = np.ceil(number_of_windows) + + # We then price the job based on the number of windows that there are + cost_result = self.costs.window_glazing( + number_of_windows=number_of_windows, + material=self.glazing_material, + is_secondary_glazing=is_secondary_glazing + ) + + glazing_type = "secondary glazing" if is_secondary_glazing else "double glazing" + if self.property.windows["glazing_coverage"] in ["partial", "most"]: + description = f"Install {glazing_type} to the remaining windows" + else: + description = f"Install {glazing_type} to all windows" + + if self.property.is_listed: + description += ". Secondary glazing recommended due to listed building status" + elif self.property.is_heritage: + description += ". Secondary glazing recommended due to herigate building status" + elif self.property.in_conservation_area: + description += ". 
def estimate_windows(
    property_type, built_form, construction_age_band, floor_area, number_habitable_rooms, extension_count
):
    """
    Heuristically estimate how many windows a property has from its EPC attributes.

    The estimate starts at one window per habitable room and is then adjusted for non-habitable rooms,
    built form, floor area, construction age, extensions and property type.

    :param property_type: EPC property type, e.g. "House", "Bungalow", "Flat" or "Maisonette"
    :param built_form: EPC built form, e.g. "Mid-Terrace", "End-Terrace", "Semi-Detached" or "Detached".
        Unrecognised values add no windows
    :param construction_age_band: EPC construction age band; only the two oldest bands add a window
    :param floor_area: Total floor area; the thresholds used are 85 and 120
    :param number_habitable_rooms: Number of habitable rooms
    :param extension_count: Number of extensions; each adds one window
    :return: The estimated number of windows
    :raises ValueError: if the resulting count is negative
    """
    # Base window count based on habitable rooms
    window_count = number_habitable_rooms

    # Additional windows for non-habitable rooms (e.g., kitchen, bathroom)
    # Assuming most houses will have at least one kitchen and one bathroom
    # Scale non-habitable windows with the number of habitable rooms
    non_habitable_base = 2  # Base for kitchen and bathroom
    extra_non_habitable = max(0, (number_habitable_rooms - 3) // 2)  # Extra for large houses
    window_count += non_habitable_base + extra_non_habitable

    # Adjustments based on built form and property type
    if property_type in ["House", "Bungalow"] and built_form in ["Semi-Detached", "Detached"]:
        built_form_lookup = {
            "Semi-Detached": 3,
            "Detached": 4,
        }
    else:
        # For Flats and Maisonettes (and terraced houses/bungalows, which also land here), adjustments
        # are smaller
        built_form_lookup = {
            "Mid-Terrace": 0,
            "End-Terrace": 1,
            "Semi-Detached": 1,
            "Detached": 2,
        }
    window_count += built_form_lookup.get(built_form, 0)

    # Adjust for floor area (larger floor area might indicate more rooms/windows)
    if floor_area < 85:  # Small to medium properties
        # Standard window count likely sufficient
        pass
    elif 85 <= floor_area <= 120:  # Medium to large properties
        # More rooms or larger rooms likely, potentially more windows
        window_count += 1
    elif floor_area > 120:  # Very large properties
        # Likely to have significantly more or larger rooms
        window_count += 2

    # Adjust for construction age band
    if construction_age_band in ["England and Wales: before 1900", "England and Wales: 1900-1929"]:
        # Older houses with smaller, more numerous windows
        window_count += 1

    # Adjust for extensions (each extension might add windows)
    window_count += extension_count

    # Adjustments for specific property types. The second entry was previously misspelt, so maisonettes
    # never received this reduction.
    if property_type in ["Flat", "Maisonette"]:
        # Flats might have fewer windows due to shared walls
        # Maisonettes might follow a similar pattern to flats or small houses
        window_count -= 1

    # Ensure window count is not negative
    if window_count < 0:
        raise ValueError("Window count cannot be negative.")

    return window_count
1d519b91..402e38eb 100644 --- a/recommendations/tests/test_costs.py +++ b/recommendations/tests/test_costs.py @@ -1,6 +1,7 @@ from recommendations.Costs import Costs from unittest.mock import Mock import datetime +import pytest class TestCosts: @@ -58,9 +59,9 @@ class TestCosts: ) assert loft_results == { - 'total': 430.21445040000003, 'subtotal': 358.512042, 'vat': 71.70240840000001, - 'contingency': 25.608003000000004, 'preliminaries': 25.608003000000004, 'material': 198.29923000000002, - 'profit': 51.21600600000001, 'labour_hours': 3.685, 'labour_cost': 57.7808, 'labour_days': 0.460625 + 'total': 639.4133610000001, 'subtotal': 532.8444675000001, 'vat': 106.56889350000002, + 'contingency': 71.045929, 'preliminaries': 35.5229645, 'material': 297.448845, 'profit': 71.045929, + 'labour_hours': 3.685, 'labour_cost': 57.7808, 'labour_days': 0.460625 } def test_internal_wall_insulation(self): @@ -176,11 +177,9 @@ class TestCosts: ) assert iwi_results == { - 'total': 6650.889456921851, 'subtotal': 5542.407880768209, 'vat': 1108.4815761536418, - 'contingency': 573.3525393898148, 'preliminaries': 382.2350262598765, - 'material': 1747.488000615996, - 'profit': 764.470052519753, 'labour_hours': 88.23759388401297, - 'labour_days': 2.757424808875405, + 'total': 6880.2304726777775, 'subtotal': 5733.525393898148, 'vat': 1146.7050787796295, + 'contingency': 764.470052519753, 'preliminaries': 382.2350262598765, 'material': 1747.488000615996, + 'profit': 764.470052519753, 'labour_hours': 88.23759388401297, 'labour_days': 2.757424808875405, 'labour_cost': 1927.1602026551818 } @@ -414,8 +413,8 @@ class TestCosts: ) assert ewi_results == { - 'total': 14561.688989159393, 'subtotal': 12134.740824299493, 'vat': 2426.948164859899, - 'contingency': 808.9827216199662, 'preliminaries': 1617.9654432399325, 'material': 4020.565147410677, + 'total': 15047.078622131372, 'subtotal': 12539.232185109477, 'vat': 2507.8464370218953, + 'contingency': 808.9827216199662, 'preliminaries': 
2022.4568040499155, 'material': 4020.565147410677, 'profit': 1617.9654432399325, 'labour_hours': 187.02533486285358, 'labour_days': 5.8445417144641745, 'labour_cost': 3921.5600094613983 } @@ -499,3 +498,48 @@ class TestCosts: 'labour_hours': 24.79, 'labour_days': 1.549375, 'labour_cost': 186.9032} assert costs.labour_adjustment_factor == 0.88 + + # Mock property instance for regional tests + @pytest.fixture(params=[ + ("Northamptonshire", "East Midlands", 7927.44), + ("Greater London Authority", "Inner London", 10475.0), + ("Adur", "South East England", 8333.32), + ("Bournemouth", "South West England", 8452), + ("Basildon", "East of England", 7895.44), + ("Birmingham", "West Midlands", 7706.2), + ("County Durham", "North East England", 8113.96), + ("Allerdale", "North West England", 6481.68), + ("York", "Yorkshire and the Humber", 8243.6), + ("Cardiff", "Wales", 7595.32), + ("Glasgow City", "Scotland", 7871.88), + ("Belfast", "Northern Ireland", 8504.36) + ]) + def mock_property_with_region(self, request): + county, region, expected_cost = request.param + mock_property = Mock() + mock_property.data = {"county": county} + return mock_property, region, expected_cost + + # Test for different wattages + @pytest.mark.parametrize("wattage, expected_cost", [ + (3000, 5945.58), + (4000, 7927.44), + (5000, 9909.3), + (6000, 11891.16), + ]) + def test_solar_pv_different_wattages(self, wattage, expected_cost): + mock_property = Mock() + mock_property.data = {"county": "Mansfield"} + costs = Costs(mock_property) + result = costs.solar_pv(wattage) + assert result['total'] == pytest.approx(expected_cost, rel=0.01) + + def test_solar_pv_regional_variation(self, mock_property_with_region): + # Test for regional cost variations + property_instance, expected_region, expected_cost = mock_property_with_region + costs = Costs(property_instance) + + assert costs.region == expected_region + + result = costs.solar_pv(4000) # Testing with a fixed wattage of 4000 + assert result['total'] == 
pytest.approx(expected_cost, rel=0.01) diff --git a/recommendations/tests/test_data/materials.py b/recommendations/tests/test_data/materials.py index d7241be5..187d1401 100644 --- a/recommendations/tests/test_data/materials.py +++ b/recommendations/tests/test_data/materials.py @@ -942,8 +942,24 @@ materials = [ 'https://www.hamuch.com/cost/led-spot-light#:~:text=It%20costs%20an%20average%20of,' 'will%20drive%20up%20the%20cost.', 'created_at': datetime.datetime(2023, 11, 28, 22, 49, 12, 244907), 'is_active': True, 'prime_material_cost': None, - 'material_cost': 20.0, 'labour_cost': 46.0, 'labour_hours_per_unit': 0.8, 'plant_cost': 0.0, 'total_cost': 66.0, + 'material_cost': 20.0, 'labour_cost': 15.0, 'labour_hours_per_unit': 0.8, 'plant_cost': 0.0, 'total_cost': 66.0, 'notes': 'We estimate the unit economics from the checkatrade article. We assume that the average job consists ' 'of installing 6 lights based on the hamuch article. We use the median value of 400 for a job of 6 ' - 'lights'} + 'lights'}, + {'id': 1235, 'type': 'windows_glazing', + 'description': 'uPVC windows; Profile 22 or other equal and approved; reinforced where appropriate with ' + 'aluminium alloy; in refurbishment work, including standard ironmongery; sills and factory glazed ' + 'with low-e 24 mm double glazing; removing existing windows and fixing new in position; including ' + 'lugs plugged and screwed to brickwork or blockwork; Casement/fixed light; including vents; ' + 'e.p.d.m. 
glazing gaskets and weather seals; 1770 mm × 1200 mm; ref P312WW', + 'depth': 0.0, 'depth_unit': None, 'cost': None, 'cost_unit': 'gbp_per_unit', 'r_value_per_mm': None, + 'r_value_unit': 'square_meter_kelvin_per_watt', 'thermal_conductivity': None, 'thermal_conductivity_unit': None, + 'link': 'SPONs', + 'created_at': datetime.datetime(2023, 11, 28, 22, 49, 12, 244907), + 'is_active': True, 'prime_material_cost': 176.55, + 'material_cost': 182.25, 'labour_cost': 163.36, 'labour_hours_per_unit': 6.5, 'plant_cost': 0.0, + 'total_cost': 345.61, + 'notes': 'This is the cost of removal of existing windows and installation of new windows. This is a casement ' + 'style window, which is the most common but also the cheapest style. In the cost estimation framework, ' + 'we can inflate prices for different finishes, to be conservative on price.'} ] diff --git a/recommendations/tests/test_fireplace_recommendations.py b/recommendations/tests/test_fireplace_recommendations.py index 570fbb5c..a91d6697 100644 --- a/recommendations/tests/test_fireplace_recommendations.py +++ b/recommendations/tests/test_fireplace_recommendations.py @@ -6,7 +6,7 @@ from recommendations.FireplaceRecommendations import FireplaceRecommendations class TestFirepaceRecommendations: def test_no_fireplaces(self): - property_instance = Property(id=0, address1="fake", postcode="fake", epc_client=Mock()) + property_instance = Property(id=0, address="fake", postcode="fake") property_instance.data = { "number-open-fireplaces": 0 } @@ -22,7 +22,7 @@ class TestFirepaceRecommendations: assert recommender.recommendation is None def test_one_fireplace(self): - property_instance = Property(id=0, address1="fake", postcode="fake", epc_client=Mock()) + property_instance = Property(id=0, address="fake", postcode="fake") property_instance.data = { "number-open-fireplaces": 1 } @@ -40,7 +40,7 @@ class TestFirepaceRecommendations: assert recommender.recommendation[0]["total"] == 300 def test_multiple_fireplaces(self): - 
property_instance = Property(id=0, address1="fake", postcode="fake", epc_client=Mock()) + property_instance = Property(id=0, address="fake", postcode="fake") property_instance.data = { "number-open-fireplaces": 3 } diff --git a/recommendations/tests/test_floor_recommendations.py b/recommendations/tests/test_floor_recommendations.py index 43e98d60..555f9a27 100644 --- a/recommendations/tests/test_floor_recommendations.py +++ b/recommendations/tests/test_floor_recommendations.py @@ -21,16 +21,6 @@ class TestFloorRecommendations: ) as f: return pickle.load(f) - @pytest.fixture - def mock_floor_rec_instance(self): - # Creating a mock instance of WallRecommendations with the necessary attributes - property_mock = Mock() - property_mock.full_sap_epc = {"lodgement-date": "2000-01-01"} - property_mock.data = {"county": "York"} - - mock_wall_rec_instance = FloorRecommendations(property_mock, materials) - return mock_wall_rec_instance - def test_init(self, input_properties): input_properties[0].insulation_floor_area = 50 input_properties[0].insulation_wall_area = 90 @@ -68,6 +58,7 @@ class TestFloorRecommendations: input_properties[2].wall_type = "solid brick" input_properties[2].floor_type = "suspended" input_properties[2].number_of_floors = 1 + input_properties[2].floor_level = 0 recommender = FloorRecommendations(property_instance=input_properties[2], materials=materials) assert recommender.estimated_u_value is None @@ -93,6 +84,8 @@ class TestFloorRecommendations: input_properties[3].insulation_floor_area = 100 input_properties[3].insulation_wall_area = 100 input_properties[3].number_of_floors = 1 + input_properties[3].floor_level = 0 + recommender = FloorRecommendations(property_instance=input_properties[3], materials=materials) assert recommender.estimated_u_value is None recommender.recommend() @@ -114,6 +107,7 @@ class TestFloorRecommendations: input_properties[4].wall_type = "solid brick" input_properties[4].floor_type = "solid" input_properties[4].number_of_floors 
= 1 + input_properties[4].floor_level = 0 # In this case, we have no county, so in this case, it should yse the local-authority-label if possible input_properties[4].data["county"] = "" diff --git a/recommendations/tests/test_lighting_recommendations.py b/recommendations/tests/test_lighting_recommendations.py index 06d1163f..964f1da0 100644 --- a/recommendations/tests/test_lighting_recommendations.py +++ b/recommendations/tests/test_lighting_recommendations.py @@ -9,7 +9,7 @@ from recommendations.tests.test_data.materials import materials class TestLightingRecommendations: def test_init_invalid_materials(self): - input_property0 = Property(id=1, postcode="F4k3 6", address1="623 fake street", epc_client=Mock()) + input_property0 = Property(id=1, postcode="F4k3 6", address="623 fake street") input_property0.lighting = {"low_energy_proportion": 0} input_property0.data = {"county": "Greater London Authority"} # Test for invalid materials @@ -18,7 +18,7 @@ class TestLightingRecommendations: def test_recommend_no_action_needed(self): # Case where no recommendation is needed - input_property1 = Property(id=1, postcode="F4k3 6", address1="623 fake street", epc_client=Mock()) + input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street") input_property1.lighting = {"low_energy_proportion": 100} input_property1.data = {"county": "Greater London Authority"} @@ -28,7 +28,7 @@ class TestLightingRecommendations: def test_recommend_action_needed(self): # Case where recommendation is needed - input_property1 = Property(id=1, postcode="F4k3 6", address1="623 fake street", epc_client=Mock()) + input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street") input_property1.lighting = {"low_energy_proportion": 100} input_property1.data = {"county": "Greater London Authority"} input_property1.lighting = {"low_energy_proportion": 0.80} @@ -40,8 +40,7 @@ class TestLightingRecommendations: assert lr.recommendation == [ {'parts': [], 'type': 
def test_estimate_windows():
    """
    Check estimate_windows against a set of reference properties.

    Each row holds the EPC inputs and the window count we expect the heuristic to produce. None of the
    reference properties has an extension.
    """
    # (property_type, built_form, construction_age_band, floor_area, number_habitable_rooms, expected)
    reference_properties = [
        # EPR for a property that has 4 windows
        ("Flat", "Semi-Detached", "England and Wales: 1976-1982", 37, 2, 4),
        # EPR for a property with 7 windows, two of which are very small (0.21m^2 and 0.3m^2), so 6 is a
        # reasonable estimate
        ("House", "Mid-Terrace", "England and Wales: 1950-1966", 69, 4, 6),
        # EPR for a bungalow with 6 windows, two of which are small (0.4m^2 each), so 5 is acceptable
        ("Bungalow", "Mid-Terrace", "England and Wales: 1967-1975", 56, 3, 5),
        # EPR for an end terrace house with 8 windows, one of which is very small (0.25m^2), so 7 is
        # acceptable
        ("House", "End-Terrace", "England and Wales: 1967-1975", 77.28, 4, 7),
        # EPR for a semi-detached house with 11 windows according to the associated condition report.
        # Right now, we estimate 12 windows for this property
        ("House", "Semi-Detached", "England and Wales: 1950-1966", 88.4, 5, 12),
        # A team member's flat with 3 large windows and no construction age band on the EPC; 5 is treated
        # as a reasonable estimate
        ("Flat", "", "", 100, 3, 5),
        # EPR for a semi-detached house where the exact number of windows is unknown. We estimate 10
        ("House", "Semi-Detached", "England and Wales: 1967-1975", 85, 4, 10),
        # A second flat known to the team
        ("Flat", "End-Terrace", "", 50, 3, 5),
    ]

    for property_type, built_form, age_band, floor_area, habitable_rooms, expected in reference_properties:
        estimated = recommendation_utils.estimate_windows(
            property_type=property_type,
            built_form=built_form,
            construction_age_band=age_band,
            floor_area=floor_area,
            number_habitable_rooms=habitable_rooms,
            extension_count=0,
        )

        assert estimated == expected, f"Expected {expected} windows, got {estimated}"
property_instance.roof = { @@ -32,7 +31,7 @@ class TestRoofRecommendations: assert len(roof_recommender.recommendations) def test_loft_insulation_recommendation_50mm_insulation(self): - property_instance2 = Property(id=0, address1="fake", postcode="fake", epc_client=Mock()) + property_instance2 = Property(id=0, address="fake", postcode="fake") property_instance2.age_band = "F" property_instance2.insulation_floor_area = 100 property_instance2.roof = { @@ -54,11 +53,11 @@ class TestRoofRecommendations: assert len(roof_recommender2.recommendations) == 1 - assert roof_recommender2.recommendations[0]["total"] == 1310.56464 + assert roof_recommender2.recommendations[0]["total"] == 1936.9206000000004 assert roof_recommender2.recommendations[0]["new_u_value"] == 0.14 assert roof_recommender2.recommendations[0]["starting_u_value"] == 0.68 - property_instance3 = Property(id=0, address1="fake", postcode="fake", epc_client=Mock()) + property_instance3 = Property(id=0, address="fake", postcode="fake") property_instance3.age_band = "F" property_instance3.insulation_floor_area = 100 property_instance3.roof = { @@ -83,7 +82,7 @@ class TestRoofRecommendations: assert roof_recommender3.recommendations[0]["parts"][0]["depth"] == 270 def test_loft_insulation_recommendation_150mm_insulation(self): - property_instance4 = Property(id=0, address1="fake", postcode="fake", epc_client=Mock()) + property_instance4 = Property(id=0, address="fake", postcode="fake") property_instance4.age_band = "F" property_instance4.insulation_floor_area = 100 property_instance4.roof = { @@ -105,12 +104,12 @@ class TestRoofRecommendations: assert len(roof_recommender4.recommendations) == 4 - assert roof_recommender4.recommendations[0]["total"] == 788.0544 + assert roof_recommender4.recommendations[0]["total"] == 1128.744 assert roof_recommender4.recommendations[0]["new_u_value"] == 0.15 assert roof_recommender4.recommendations[0]["starting_u_value"] == 0.3 assert 
roof_recommender4.recommendations[0]["parts"][0]["depth"] == 150 - property_instance5 = Property(id=0, address1="fake", postcode="fake", epc_client=Mock()) + property_instance5 = Property(id=0, address="fake", postcode="fake") property_instance5.age_band = "F" property_instance5.insulation_floor_area = 100 property_instance5.roof = { @@ -137,7 +136,7 @@ class TestRoofRecommendations: def test_loft_insulation_recommendation_270mm_insulation(self): # We shouldn't recommend anything in this case - property_instance6 = Property(id=0, address1="fake", postcode="fake", epc_client=Mock()) + property_instance6 = Property(id=0, address="fake", postcode="fake") property_instance6.age_band = "F" property_instance6.insulation_floor_area = 100 property_instance6.roof = { @@ -278,7 +277,7 @@ class TestRoofRecommendations: # "Insulate your room roof with 270mm of Example room roof insulation" def test_flat_no_insulation(self): - property_instance11 = Property(id=11, address1="fake", postcode="fake", epc_client=Mock()) + property_instance11 = Property(id=11, address="fake", postcode="fake") property_instance11.age_band = "D" property_instance11.insulation_floor_area = 33.5 property_instance11.perimeter = 24 @@ -307,7 +306,7 @@ class TestRoofRecommendations: "Insulate the home's flat roof with 150mm of Ecotherm Eco-Versal General Purpose Insulation Board" def test_flat_insulated(self): - property_instance12 = Property(id=12, address1="fake", postcode="fake", epc_client=Mock()) + property_instance12 = Property(id=12, address="fake", postcode="fake") property_instance12.age_band = "D" property_instance12.insulation_floor_area = 40 property_instance12.perimeter = 30 @@ -331,7 +330,7 @@ class TestRoofRecommendations: assert not roof_recommender12.recommendations def test_flat_limited_insulation(self): - property_instance13 = Property(id=12, address1="fake", postcode="fake", epc_client=Mock()) + property_instance13 = Property(id=12, address="fake", postcode="fake") 
property_instance13.age_band = "D" property_instance13.insulation_floor_area = 40 property_instance13.perimeter = 40 @@ -363,7 +362,7 @@ class TestRoofRecommendations: "Insulate the home's flat roof with 150mm of Ecotherm Eco-Versal General Purpose Insulation Board" def test_property_above(self): - property_instance14 = Property(id=0, address1="fake", postcode="fake", epc_client=Mock()) + property_instance14 = Property(id=0, address="fake", postcode="fake") property_instance14.age_band = "F" property_instance14.insulation_floor_area = 100 property_instance14.roof = { diff --git a/recommendations/tests/test_solar_pv_recommendations.py b/recommendations/tests/test_solar_pv_recommendations.py new file mode 100644 index 00000000..f2436cb1 --- /dev/null +++ b/recommendations/tests/test_solar_pv_recommendations.py @@ -0,0 +1,79 @@ +import pytest +from recommendations.SolarPvRecommendations import SolarPvRecommendations +from backend.Property import Property + + +class TestSolarPvRecommendations: + @pytest.fixture + def property_instance_invalid_type(self): + # Setup the property_instance with an invalid property type + property_instance_invalid_type = Property(id=1, address="", postcode="") + property_instance_invalid_type.data = { + "property-type": "InvalidType", "county": "Broxbourne", "photo-supply": None + } + property_instance_invalid_type.roof = {"is_flat": False, "is_pitched": False, "is_roof_room": False} + return property_instance_invalid_type + + @pytest.fixture + def property_instance_invalid_roof(self): + # Setup the property_instance with invalid roof type + property_instance_invalid_roof = Property(id=1, address="", postcode="") + property_instance_invalid_roof.data = { + "county": "Huntingdonshire", "property-type": "House", "photo-supply": None + } + property_instance_invalid_roof.roof = {"is_flat": False, "is_pitched": False, "is_roof_room": False} + return property_instance_invalid_roof + + @pytest.fixture + def property_instance_has_solar_pv(self): + 
# Setup the property_instance with existing solar pv + property_instance_has_solar_pv = Property(id=1, address="", postcode="") + property_instance_has_solar_pv.data = {"photo-supply": "40", "county": "Huntingdonshire", + "property-type": "House"} + property_instance_has_solar_pv.roof = {"is_flat": True} + return property_instance_has_solar_pv + + @pytest.fixture + def property_instance_valid_all(self): + # Setup a valid property_instance that passes all conditions + property_instance_valid_all = Property(id=1, address="", postcode="") + property_instance_valid_all.solar_pv_roof_area = 20 + property_instance_valid_all.solar_pv_percentage = 40 + property_instance_valid_all.data = {"property-type": "House", "photo-supply": None, "county": "Huntingdonshire"} + property_instance_valid_all.roof = {"is_flat": True} + return property_instance_valid_all + + def test_invalid_property_type(self, property_instance_invalid_type): + solar_pv = SolarPvRecommendations(property_instance_invalid_type) + solar_pv.recommend() + assert not solar_pv.recommendation + + def test_invalid_roof_type(self, property_instance_invalid_roof): + solar_pv = SolarPvRecommendations(property_instance_invalid_roof) + solar_pv.recommend() + assert not solar_pv.recommendation + + def test_existing_solar_pv(self, property_instance_has_solar_pv): + solar_pv = SolarPvRecommendations(property_instance_has_solar_pv) + solar_pv.recommend() + assert not solar_pv.recommendation + + def test_valid_all_conditions(self, property_instance_valid_all): + solar_pv = SolarPvRecommendations(property_instance_valid_all) + solar_pv.recommend() + assert solar_pv.recommendation == [ + { + 'parts': [], + 'type': 'solar_pv', + 'description': 'Install a 4 kilowatt-peak (kWp) solar photovoltaic (PV) panel system on the roof', + 'starting_u_value': None, + 'new_u_value': None, + 'sap_points': None, + 'total': 8527.0752, + 'subtotal': 7105.896, + 'vat': 1421.1791999999996, + 'labour_hours': 72, + 'labour_days': 2, +
'photo_supply': 4000 + } + ] diff --git a/recommendations/tests/test_ventilation_recommendations.py b/recommendations/tests/test_ventilation_recommendations.py index 893bb01a..3242b1d1 100644 --- a/recommendations/tests/test_ventilation_recommendations.py +++ b/recommendations/tests/test_ventilation_recommendations.py @@ -1,5 +1,4 @@ from backend.Property import Property -from unittest.mock import Mock from recommendations.VentilationRecommendations import VentilationRecommendations from recommendations.tests.test_data.materials import materials @@ -7,7 +6,7 @@ from recommendations.tests.test_data.materials import materials class TestVentilationRecommendations: def test_natural_ventilation(self): - input_property1 = Property(id=1, postcode="F4k3 6", address1="623 fake street", epc_client=Mock()) + input_property1 = Property(id=1, postcode="F4k3 6", address="623 fake street") input_property1.data = {"mechanical-ventilation": "natural"} recommender = VentilationRecommendations( @@ -28,7 +27,7 @@ class TestVentilationRecommendations: assert recommender.recommendation[0]["parts"][0]["quantity"] == 2 def test_missing_ventilation(self): - input_property2 = Property(id=1, postcode="F4k3 6", address1="623 fake street", epc_client=Mock()) + input_property2 = Property(id=1, postcode="F4k3 6", address="623 fake street") input_property2.data = {"mechanical-ventilation": None} recommender2 = VentilationRecommendations( @@ -49,7 +48,7 @@ class TestVentilationRecommendations: assert recommender2.recommendation[0]["parts"][0]["quantity"] == 2 def test_nodata_ventilation(self): - input_property3 = Property(id=1, postcode="F4k3 6", address1="623 fake street", epc_client=Mock()) + input_property3 = Property(id=1, postcode="F4k3 6", address="623 fake street") input_property3.data = {"mechanical-ventilation": "NO DATA!!"} recommender3 = VentilationRecommendations( @@ -70,7 +69,7 @@ class TestVentilationRecommendations: assert recommender3.recommendation[0]["parts"][0]["quantity"] == 2 
def test_existing_ventilation_1(self): - input_property4 = Property(id=1, postcode="F4k3 6", address1="623 fake street", epc_client=Mock()) + input_property4 = Property(id=1, postcode="F4k3 6", address="623 fake street") input_property4.data = {"mechanical-ventilation": 'mechanical, extract only'} recommender4 = VentilationRecommendations( @@ -86,7 +85,7 @@ class TestVentilationRecommendations: assert recommender4.has_ventilaion def test_existing_ventilation_2(self): - input_property5 = Property(id=1, postcode="F4k3 6", address1="623 fake street", epc_client=Mock()) + input_property5 = Property(id=1, postcode="F4k3 6", address="623 fake street") input_property5.data = {"mechanical-ventilation": 'mechanical, supply and extract'} recommender5 = VentilationRecommendations( diff --git a/recommendations/tests/test_wall_recommendations.py b/recommendations/tests/test_wall_recommendations.py index 0258e592..bfc681f5 100644 --- a/recommendations/tests/test_wall_recommendations.py +++ b/recommendations/tests/test_wall_recommendations.py @@ -231,7 +231,7 @@ class TestWallRecommendationsBase: class TestCavityWallRecommensations: def test_fill_empty_cavity(self): - input_property = Property(id=1, postcode="F4k3", address1="123 fake street", epc_client=Mock()) + input_property = Property(id=1, postcode="F4k3", address="123 fake street") input_property.walls = { 'original_description': 'Cavity wall, as built, no insulation (assumed)', 'clean_description': 'Cavity wall, as built, no insulation', @@ -265,7 +265,7 @@ class TestCavityWallRecommensations: assert np.isclose(recommender.recommendations[1]["total"], 2004.6600000000003) def test_fill_partial_filled_cavity(self): - input_property = Property(id=1, postcode="F4k3", address1="123 fake street", epc_client=Mock()) + input_property = Property(id=1, postcode="F4k3", address="123 fake street") input_property.walls = { 'original_description': 'Cavity wall, as built, partial insulation (assumed)', 'clean_description': 'Cavity wall, 
as built, partial insulation', @@ -299,7 +299,7 @@ class TestCavityWallRecommensations: assert np.isclose(recommender.recommendations[1]["total"], 1999.9350000000002) def test_system_built_wall(self): - input_property2 = Property(id=1, postcode="F4k3 2", address1="223 fake street", epc_client=Mock()) + input_property2 = Property(id=1, postcode="F4k3 2", address="223 fake street") input_property2.walls = { 'original_description': 'System built, as built, no insulation (assumed)', 'clean_description': 'System built, as built, no insulation', @@ -331,22 +331,22 @@ class TestCavityWallRecommensations: assert len(recommender2.recommendations) == 9 assert recommender2.estimated_u_value == 1 assert np.isclose(recommender2.recommendations[0]["new_u_value"], 0.19) - assert np.isclose(recommender2.recommendations[0]["total"], 15899.9616) + assert np.isclose(recommender2.recommendations[0]["total"], 16429.960320000002) assert recommender2.recommendations[0]["parts"][0]["type"] == "external_wall_insulation" assert recommender2.recommendations[0]["parts"][0]["depth"] == 100 assert np.isclose(recommender2.recommendations[8]["new_u_value"], 0.23) - assert np.isclose(recommender2.recommendations[8]["total"], 10916.3424) + assert np.isclose(recommender2.recommendations[8]["total"], 11292.768) assert recommender2.recommendations[8]["parts"][0]["type"] == "internal_wall_insulation" assert recommender2.recommendations[8]["parts"][0]["depth"] == 72.5 assert np.isclose(recommender2.recommendations[6]["new_u_value"], 0.29) - assert np.isclose(recommender2.recommendations[6]["total"], 10621.934399999998) + assert np.isclose(recommender2.recommendations[6]["total"], 10988.208) assert recommender2.recommendations[6]["parts"][0]["type"] == "internal_wall_insulation" assert recommender2.recommendations[6]["parts"][0]["depth"] == 52.5 def test_timber_frame_wall(self): - input_property3 = Property(id=1, postcode="F4k3 2", address1="223 fake street", epc_client=Mock()) + input_property3 = 
Property(id=1, postcode="F4k3 2", address="223 fake street") input_property3.walls = { 'original_description': 'Timber frame, as built, no insulation (assumed)', 'clean_description': 'Timber frame, as built, no insulation', @@ -378,17 +378,17 @@ class TestCavityWallRecommensations: assert len(recommender3.recommendations) == 6 assert recommender3.estimated_u_value == 1.9 assert np.isclose(recommender3.recommendations[0]["new_u_value"], 0.2) - assert np.isclose(recommender3.recommendations[0]["total"], 13117.46832) + assert np.isclose(recommender3.recommendations[0]["total"], 13554.717263999999) assert recommender3.recommendations[0]["parts"][0]["type"] == "external_wall_insulation" assert recommender3.recommendations[0]["parts"][0]["depth"] == 100.0 assert np.isclose(recommender3.recommendations[1]["new_u_value"], 0.23) - assert np.isclose(recommender3.recommendations[1]["total"], 34070.50944) + assert np.isclose(recommender3.recommendations[1]["total"], 35206.19308800001) assert recommender3.recommendations[1]["parts"][0]["type"] == "external_wall_insulation" assert recommender3.recommendations[1]["parts"][0]["depth"] == 150.0 def test_granite_or_whinstone_wall(self): - input_property4 = Property(id=1, postcode="F4k3 2", address1="223 fake street", epc_client=Mock()) + input_property4 = Property(id=1, postcode="F4k3 2", address="223 fake street") input_property4.walls = { 'original_description': 'Granite or whinstone, as built, no insulation (assumed)', 'clean_description': 'Granite or whinstone, as built, no insulation', @@ -420,17 +420,17 @@ class TestCavityWallRecommensations: assert len(recommender4.recommendations) == 6 assert recommender4.estimated_u_value == 2.3 assert np.isclose(recommender4.recommendations[0]["new_u_value"], 0.21) - assert np.isclose(recommender4.recommendations[0]["total"], 28562.514352) + assert np.isclose(recommender4.recommendations[0]["total"], 29547.42864) assert recommender4.recommendations[0]["parts"][0]["type"] == 
"external_wall_insulation" assert recommender4.recommendations[0]["parts"][0]["depth"] == 100 assert np.isclose(recommender4.recommendations[1]["new_u_value"], 0.23) - assert np.isclose(recommender4.recommendations[1]["total"], 74186.52678400002) + assert np.isclose(recommender4.recommendations[1]["total"], 76744.68288000001) assert recommender4.recommendations[1]["parts"][0]["type"] == "external_wall_insulation" assert recommender4.recommendations[1]["parts"][0]["depth"] == 150 def test_cob_wall(self): - input_property5 = Property(id=1, postcode="F4k3 2", address1="223 fake street", epc_client=Mock()) + input_property5 = Property(id=1, postcode="F4k3 2", address="223 fake street") input_property5.walls = { 'original_description': 'Cob, as built', 'clean_description': 'Cob, as built', @@ -462,17 +462,17 @@ class TestCavityWallRecommensations: assert len(recommender5.recommendations) == 5 assert recommender5.estimated_u_value == 0.8 assert np.isclose(recommender5.recommendations[0]["new_u_value"], 0.29) - assert np.isclose(recommender5.recommendations[0]["total"], 8665.040384000002) + assert np.isclose(recommender5.recommendations[0]["total"], 8963.834880000002) assert recommender5.recommendations[0]["parts"][0]["type"] == "external_wall_insulation" assert recommender5.recommendations[0]["parts"][0]["depth"] == 50 assert np.isclose(recommender5.recommendations[3]["new_u_value"], 0.26) - assert np.isclose(recommender5.recommendations[3]["total"], 20078.742992) + assert np.isclose(recommender5.recommendations[3]["total"], 20771.11344) assert recommender5.recommendations[3]["parts"][0]["type"] == "internal_wall_insulation" assert recommender5.recommendations[3]["parts"][0]["depth"] == 100 def test_sandstone_or_limestone_wall(self): - input_property6 = Property(id=1, postcode="F4k3 6", address1="623 fake street", epc_client=Mock()) + input_property6 = Property(id=1, postcode="F4k3 6", address="623 fake street") input_property6.walls = { 'original_description': 
'Sandstone or limestone, as built, no insulation (assumed)', 'clean_description': 'Sandstone or limestone, as built, no insulation', @@ -504,16 +504,16 @@ class TestCavityWallRecommensations: assert len(recommender6.recommendations) == 9 assert recommender6.estimated_u_value == 1 assert np.isclose(recommender6.recommendations[0]["new_u_value"], 0.19) - assert np.isclose(recommender6.recommendations[0]["total"], 44829.0584) + assert np.isclose(recommender6.recommendations[0]["total"], 46374.888000000006) assert recommender6.recommendations[0]["parts"][0]["type"] == "external_wall_insulation" assert recommender6.recommendations[0]["parts"][0]["depth"] == 100 assert np.isclose(recommender6.recommendations[2]["new_u_value"], 0.21) - assert np.isclose(recommender6.recommendations[2]["total"], 116436.25280000002) + assert np.isclose(recommender6.recommendations[2]["total"], 120451.29600000002) assert recommender6.recommendations[2]["parts"][0]["type"] == "external_wall_insulation" assert recommender6.recommendations[2]["parts"][0]["depth"] == 150 assert np.isclose(recommender6.recommendations[4]["new_u_value"], 0.28) - assert np.isclose(recommender6.recommendations[4]["total"], 91267.0136) + assert np.isclose(recommender6.recommendations[4]["total"], 94414.15199999999) assert recommender6.recommendations[4]["parts"][0]["type"] == "internal_wall_insulation" assert recommender6.recommendations[4]["parts"][0]["depth"] == 100 diff --git a/recommendations/tests/test_window_recommendations.py b/recommendations/tests/test_window_recommendations.py new file mode 100644 index 00000000..664a1e39 --- /dev/null +++ b/recommendations/tests/test_window_recommendations.py @@ -0,0 +1,252 @@ +from recommendations.WindowsRecommendations import WindowsRecommendations +from backend.Property import Property +from recommendations.tests.test_data.materials import materials + + +class TestWindowRecommendations: + + def test_fully_single_glazed(self): + """ + For this property, we expect all 
windows to be single glazed and should recommend full double glazing + :return: + """ + + property_1 = Property( + id=1, + postcode='1', + address='1', + data={ + "county": "Wychavon", + "multi-glaze-proportion": 0, + "uprn": 0 + } + ) + property_1.windows = { + 'original_description': 'Single glazed', 'has_glazing': False, 'glazing_coverage': 'full', + 'glazing_type': 'single', + 'no_data': False + } + property_1.number_of_windows = 7 + + recommender = WindowsRecommendations(property_instance=property_1, materials=materials) + + assert not recommender.recommendation + + recommender.recommend() + + assert recommender.recommendation == [ + {'parts': [], 'type': 'windows_glazing', 'description': 'Install double glazing to all windows', + 'starting_u_value': None, 'new_u_value': None, 'sap_points': None, 'total': 5721.943248, + 'subtotal': 4768.28604, 'vat': 953.6572080000001, 'contingency': 340.59186, 'preliminaries': 340.59186, + 'material': 1275.75, 'profit': 681.18372, 'labour_hours': 45.5, 'labour_cost': 994.8624, + 'labour_days': 2.84375, 'is_secondary_glazing': False}] + + def test_partial_double_glazed(self): + """ + For this property, the double glazing is described as partial, therefore we recommend completion of + double glazing + :return: + """ + + property_2 = Property( + id=1, + postcode='1', + address='1', + data={ + "county": "Wychavon", + "multi-glaze-proportion": 33, + "uprn": 0 + } + ) + property_2.windows = {'original_description': 'Mostly double glazing', 'has_glazing': True, + 'glazing_coverage': 'most', + 'glazing_type': 'double', 'no_data': False} + property_2.number_of_windows = 7 + + recommender2 = WindowsRecommendations(property_instance=property_2, materials=materials) + + assert not recommender2.recommendation + + recommender2.recommend() + + assert recommender2.recommendation == [ + {'parts': [], 'type': 'windows_glazing', 'description': 'Install double glazing to the remaining windows', + 'starting_u_value': None, 'new_u_value': None,
'sap_points': None, 'total': 4087.10232, + 'subtotal': 3405.9186, 'vat': 681.18372, 'contingency': 243.2799, 'preliminaries': 243.2799, + 'material': 911.25, 'profit': 486.5598, 'labour_hours': 32.5, 'labour_cost': 710.6160000000001, + 'labour_days': 2.03125, 'is_secondary_glazing': False}] + + def test_fully_double_glazed(self): + """ + This property has full double glazing so we shouldn't recommend anything + :return: + """ + + property_3 = Property( + id=1, + postcode='1', + address='1', + data={ + "county": "Wychavon", + "multi-glaze-proportion": 80, + "uprn": 0 + } + ) + property_3.windows = {'original_description': 'Fully double glazed', 'has_glazing': True, + 'glazing_coverage': 'full', + 'glazing_type': 'double', 'no_data': False} + property_3.number_of_windows = 7 + + recommender3 = WindowsRecommendations(property_instance=property_3, materials=materials) + + assert not recommender3.recommendation + + recommender3.recommend() + + assert not recommender3.recommendation + + def test_fully_secondary_glazed(self): + property_4 = Property( + id=1, + postcode='1', + address='1', + data={ + "county": "Wychavon", + "multi-glaze-proportion": 100, + "uprn": 0 + } + ) + property_4.windows = {'original_description': 'Full secondary glazing', 'has_glazing': True, + 'glazing_coverage': 'full', + 'glazing_type': 'secondary', 'no_data': False} + property_4.number_of_windows = 7 + + recommender4 = WindowsRecommendations(property_instance=property_4, materials=materials) + + assert not recommender4.recommendation + + recommender4.recommend() + + assert not recommender4.recommendation + + def test_partial_secondary_glazing(self): + property_5 = Property( + id=1, + postcode='1', + address='1', + data={ + "county": "Wychavon", + "multi-glaze-proportion": 50, + "uprn": 0 + } + ) + property_5.windows = {'original_description': 'Partial secondary glazing', 'has_glazing': True, + 'glazing_coverage': 'partial', + 'glazing_type': 'secondary', 'no_data': False} + 
property_5.number_of_windows = 7 + + recommender5 = WindowsRecommendations(property_instance=property_5, materials=materials) + + assert not recommender5.recommendation + + recommender5.recommend() + + assert recommender5.recommendation == [ + {'parts': [], 'type': 'windows_glazing', + 'description': 'Install secondary glazing to the remaining windows', + 'starting_u_value': None, 'new_u_value': None, 'sap_points': None, 'total': 1089.893952, + 'subtotal': 908.24496, 'vat': 181.64899200000002, 'contingency': 64.87464, 'preliminaries': 64.87464, + 'material': 729.0, 'profit': 129.74928, 'labour_hours': 13.0, 'labour_cost': 568.4928, + 'labour_days': 0.8125, 'is_secondary_glazing': True}] + + def test_single_glazed_restricted_measures(self): + property_6 = Property( + id=1, + postcode='1', + address='1', + data={ + "county": "Wychavon", + "multi-glaze-proportion": 0, + "uprn": 0 + } + ) + property_6.windows = {'original_description': 'Single glazed', 'has_glazing': False, 'glazing_coverage': None, + 'glazing_type': 'single', + 'no_data': False} + property_6.number_of_windows = 7 + property_6.restricted_measures = True + property_6.is_heritage = True + + recommender6 = WindowsRecommendations(property_instance=property_6, materials=materials) + + assert not recommender6.recommendation + + recommender6.recommend() + + assert recommender6.recommendation == [ + {'parts': [], 'type': 'windows_glazing', + 'description': 'Install secondary glazing to all windows. 
Secondary ' + 'glazing recommended due to herigate building status', + 'starting_u_value': None, 'new_u_value': None, 'sap_points': None, + 'total': 1907.314416, 'subtotal': 1589.42868, 'vat': 317.885736, + 'contingency': 113.53062, 'preliminaries': 113.53062, + 'material': 1275.75, 'profit': 227.06124, 'labour_hours': 22.75, + 'labour_cost': 994.8624, 'labour_days': 1.421875, 'is_secondary_glazing': True} + ] + + def test_full_triple_glazed(self): + property_7 = Property( + id=1, + postcode='1', + address='1', + data={ + "county": "Wychavon", + "multi-glaze-proportion": 100, + "uprn": 0 + } + ) + property_7.windows = {'original_description': 'Fully triple glazed', 'has_glazing': True, + 'glazing_coverage': 'full', + 'glazing_type': 'triple', 'no_data': False} + property_7.number_of_windows = 7 + + recommender7 = WindowsRecommendations(property_instance=property_7, materials=materials) + + assert not recommender7.recommendation + + recommender7.recommend() + + assert not recommender7.recommendation + + def test_partial_triple_glazed(self): + """ + We should just recommend double glazing to the remaining windows, since it's a cheaper option + """ + + property_8 = Property( + id=1, + postcode='1', + address='1', + data={ + "county": "Wychavon", + "multi-glaze-proportion": 80, + "uprn": 1 + } + ) + property_8.windows = {'original_description': 'Mostly triple glazing', 'has_glazing': True, + 'glazing_coverage': 'most', + 'glazing_type': 'triple', 'no_data': False} + property_8.number_of_windows = 7 + + recommender8 = WindowsRecommendations(property_instance=property_8, materials=materials) + + assert not recommender8.recommendation + + recommender8.recommend() + + assert recommender8.recommendation == [ + {'parts': [], 'type': 'windows_glazing', 'description': 'Install double glazing to the remaining windows', + 'starting_u_value': None, 'new_u_value': None, 'sap_points': None, 'total': 1634.840928, + 'subtotal': 1362.36744, 'vat': 272.47348800000003, 'contingency': 
97.31196, 'preliminaries': 97.31196, + 'material': 364.5, 'profit': 194.62392, 'labour_hours': 13.0, 'labour_cost': 284.2464, + 'labour_days': 0.8125, 'is_secondary_glazing': False}]