From a5ef3b8483ee1d096f0e62a0b53611359de363a0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 20 Jul 2023 18:51:55 +0100 Subject: [PATCH] extending description cleaning for expanded data --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- model_data/app.py | 83 +++++++++++++++++--- model_data/epc_attributes/FloorAttributes.py | 11 +++ 4 files changed, 84 insertions(+), 14 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 81384fa9..ac61a988 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -6,7 +6,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 3b05c6ac..242c02bb 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/model_data/app.py b/model_data/app.py index 74fa5c15..fe065f26 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -72,6 +72,7 @@ def handler(): # TODO: Do this at a constituency level constituencies = {p.data["constituency"] for p in input_properties} property_types = ["bungalow", "flat", "house", "maisonette", "park home"] + floor_areas = ["unknown", "s", "m", "l", "xl", "xxl", "xxxl"] # We pull properties from local authorities, by property type. This will allow us to build # a dataset of up to 10k properties per local authority/property type combination @@ -82,24 +83,28 @@ def handler(): data = [] for c in tqdm(constituencies): for pt in property_types: - data.extend( - pagenated_epc_download( - client=epc_client, - params={ - "constituency": c, - "property-type": pt, - "from-month": 8, - "from-year": 2014, - }, - page_size=5000, - n_pages=10, + for fa in floor_areas: + data.extend( + pagenated_epc_download( + client=epc_client, + params={ + "constituency": c, + "property-type": pt, + "from-month": 8, + "from-year": 2014, + "floor-area": fa, + }, + page_size=5000, + n_pages=10, + ) ) - ) # Incorporate input data into cleaning cleaner = EpcClean(data + [p.data for p in input_properties]) cleaner.clean() + z = [x for x in data if x["floor-description"] == "(anheddiad arall islaw)"] + address_meta = [ { "postcode": x["postcode"].upper(), @@ -137,6 +142,60 @@ def handler(): uvalue_estimates = UvalueEstimations(data=data) uvalue_estimates.get_estimates(cleaner=cleaner) + x = {'low-energy-fixed-light-count': '', 'address': 'Flat 28, 22, Adelina Grove', 'uprn-source': 'Address Matched', + 'floor-height': '', 'heating-cost-potential': '668', 'unheated-corridor-length': '7.73', + 'hot-water-cost-potential': '190', 'construction-age-band': 'England and Wales: 1991-1995', + 'potential-energy-rating': 'D', 'mainheat-energy-eff': 'Very Poor', 'windows-env-eff': 'Average', + 'lighting-energy-eff': 'Average', 'environment-impact-potential': '46', + 'glazed-type': 'double glazing, unknown install date', 'heating-cost-current': '1081', 'address3': '', + 'mainheatcont-description': 'No time or thermostatic control of room temperature', + 'sheating-energy-eff': 'N/A', 'property-type': 'Flat', 'local-authority-label': 'Tower Hamlets', + 'fixed-lighting-outlets-count': '', 'energy-tariff': 'dual', 'mechanical-ventilation': 'natural', + 'hot-water-cost-current': '190', 'county': 'Greater London Authority', 'postcode': 'E1 3BX', + 'solar-water-heating-flag': 'N', 'constituency': 'E14000555', 'co2-emissions-potential': '5.2', + 'number-heated-rooms': '2', 'floor-description': '(another dwelling below)', + 'energy-consumption-potential': '301', 'local-authority': 'E09000030', 'built-form': 'Semi-Detached', + 'number-open-fireplaces': '0', 'windows-description': 'Fully double glazed', 'glazed-area': 'Normal', + 'inspection-date': '2018-09-05', 'mains-gas-flag': 'N', 'co2-emiss-curr-per-floor-area': '53', + 'address1': 'Flat 28', 'heat-loss-corridor': 'unheated corridor', 'flat-storey-count': '', + 'constituency-label': 'Bethnal Green and Bow', 'roof-energy-eff': 'Average', 'total-floor-area': '103.0', + 'building-reference-number': '4441803568', 'environment-impact-current': '44', 'co2-emissions-current': '5.5', + 'roof-description': 'Pitched, insulated (assumed)', 'floor-energy-eff': 'NO DATA!', + 'number-habitable-rooms': '2', 'address2': '22, Adelina Grove', 'hot-water-env-eff': 'Poor', + 'posttown': 'LONDON', 'mainheatc-energy-eff': 'Very Poor', 'main-fuel': 'electricity (not community)', + 'lighting-env-eff': 'Average', 'windows-energy-eff': 'Average', 'floor-env-eff': 'N/A', + 'sheating-env-eff': 'N/A', 'lighting-description': 'Low energy lighting in 25% of fixed outlets', + 'roof-env-eff': 'Average', 'walls-energy-eff': 'Good', 'photo-supply': '', 'lighting-cost-potential': '84', + 'mainheat-env-eff': 'Very Poor', 'multi-glaze-proportion': '100', 'main-heating-controls': '2701', + 'lodgement-datetime': '2018-09-06 17:25:59', 'flat-top-storey': 'Y', 'current-energy-rating': 'E', + 'secondheat-description': 'None', 'walls-env-eff': 'Good', 'transaction-type': 'rental (private)', + 'uprn': '6032920', 'current-energy-efficiency': '48', 'energy-consumption-current': '316', + 'mainheat-description': 'Electric ceiling heating', 'lighting-cost-current': '147', + 'lodgement-date': '2018-09-06', 'extension-count': '1', 'mainheatc-env-eff': 'Very Poor', + 'lmk-key': '175926409402018090617255958380158', 'wind-turbine-count': '0', 'tenure': 'rental (private)', + 'floor-level': '4th', 'potential-energy-efficiency': '67', 'hot-water-energy-eff': 'Average', + 'low-energy-lighting': '25', 'walls-description': 'Solid brick, as built, insulated (assumed)', + 'hotwater-description': 'Electric immersion, off-peak'} + from utils.uvalue_estimates import classify_decile_newvalues + total_floor_area_group_decile = UvalueEstimations.classify_decile_newvalues( + decile_boundaries=uvalue_estimates.walls_decile_data["decile_boundaries"], + decile_labels=uvalue_estimates.walls_decile_data["decile_labels"], + new_values=[float(x["total-floor-area"])], + )[0] + + u_value_estimate = uvalue_estimates.walls[ + (uvalue_estimates.walls["local-authority"] == x["local-authority"]) & + (uvalue_estimates.walls["property-type"] == x["property-type"]) & + (uvalue_estimates.walls["built-form"] == x["built-form"]) & + (uvalue_estimates.walls["walls-energy-eff"] == x["walls-energy-eff"]) & + (uvalue_estimates.walls["walls-env-eff"] == x["walls-env-eff"]) & + (uvalue_estimates.walls["total-floor-area_group"] == total_floor_area_group_decile) + ] + + uvalue_estimates.walls[ + uvalue_estimates.walls + ] + # all_data = { # "input_properties": input_properties, # "cleaner": cleaner, diff --git a/model_data/epc_attributes/FloorAttributes.py b/model_data/epc_attributes/FloorAttributes.py index b649cdd2..0d8ea493 100644 --- a/model_data/epc_attributes/FloorAttributes.py +++ b/model_data/epc_attributes/FloorAttributes.py @@ -12,12 +12,23 @@ class FloorAttributes(BaseUtility): OBSERVED_ERRORS = ["Conservatory"] + WELSH_TEXT = { + "(anheddiad arall islaw)": "(another dwelling below)", + } + def __init__(self, description: str): self.description: str = description.lower() self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or ( description in self.OBSERVED_ERRORS) + # Try and perform a translation, incase it's in welsh + translation = self.WELSH_TEXT.get(self.description) + + if translation: + self.nodata = False + self.description = translation + if not self.nodata and not any( rt in self.description for rt in self.FLOOR_TYPES + self.DWELLING_BELOW + ["average thermal transmittance"]