diff --git a/backend/Property.py b/backend/Property.py index a59103df..57e653a7 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -3,6 +3,7 @@ import re import os import pandas as pd +from etl.epc.DataProcessor import DataProcessor from utils.logger import setup_logger from utils.s3 import read_dataframe_from_s3_parquet from epc_api.client import EpcClient @@ -50,6 +51,7 @@ class Property(Definitions): self.uprn = None self.full_sap_epc = None self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None + self.restricted_measures = False self.year_built = None self.number_of_rooms = None @@ -139,7 +141,7 @@ class Property(Definitions): """ ventilation = self.data["mechanical-ventilation"] - # perform some simple cleaning - when checking 300k property_change, the only unique values were + # perform some simple cleaning - when checking 300k epc, the only unique values were # {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'} if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]: ventilation = None @@ -157,7 +159,7 @@ class Property(Definitions): - solar_pv This is based on the "photo-supply" field in the EPC data. 
- When checking 100k property_change, either the value was "" or a stringified number + When checking 100k epc, either the value was "" or a stringified number """ solar_pv = self.data["photo-supply"] @@ -287,7 +289,8 @@ class Property(Definitions): if not self.data: raise ValueError("Property does not contain data") - self.age_band = england_wales_age_band_lookup[self.data["construction-age-band"]] + construction_age_band = DataProcessor.clean_construction_age_band(self.data["construction-age-band"]) + self.age_band = england_wales_age_band_lookup.get(construction_age_band) def set_spatial(self, spatial: pd.DataFrame): """ @@ -295,8 +298,11 @@ class Property(Definitions): :param spatial: Dataframe, containing the spatial data for the property """ self.in_conservation_area = spatial["conservation_status"].values[0] - self.is_listed = spatial["is_listed"].values[0] - self.is_heritage = spatial["is_heritage"].values[0] + self.is_listed = spatial["is_listed_building"].values[0] + self.is_heritage = spatial["is_heritage_building"].values[0] + + if self.in_conservation_area | self.is_listed | self.is_heritage: + self.restricted_measures = True def set_year_built(self): """ @@ -476,7 +482,7 @@ class Property(Definitions): self.floor_area = float(self.data["total-floor-area"]) - def get_spatial_data(self): + def get_spatial_data(self, uprn_filenames): """ Given a property's UPRN, this method will pull the associated spatial data from s3 @@ -486,13 +492,8 @@ class Property(Definitions): if self.uprn is None: raise ValueError("URPN is not set, run search_address_epc") - # We get the filenames - filenames = read_dataframe_from_s3_parquet( - bucket_name=DATA_BUCKET, file_key="spatial/filename_meta.parquet" - ) - # We get the file name for the uprn - filtered_df = filenames[(filenames['lower'] <= self.uprn) & (filenames['upper'] >= self.uprn)] + filtered_df = uprn_filenames[(uprn_filenames['lower'] <= self.uprn) & (uprn_filenames['upper'] >= self.uprn)] if filtered_df.empty: 
logger.warning("Could not find file containing UPRNS") return None diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index d2e5b4f3..24dfebda 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -27,14 +27,15 @@ from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_par from backend.ml_models.sap_change_model.api import SAPChangeModelAPI from backend.Property import Property -from etl.property_change.DataProcessor import DataProcessor -from etl.property_change.settings import COLUMNS_TO_MERGE_ON +from etl.epc.DataProcessor import DataProcessor +from etl.epc.settings import COLUMNS_TO_MERGE_ON from recommendations.FloorRecommendations import FloorRecommendations from recommendations.optimiser.CostOptimiser import CostOptimiser from recommendations.optimiser.GainOptimiser import GainOptimiser from recommendations.optimiser.optimiser_functions import prepare_input_measures from recommendations.WallRecommendations import WallRecommendations from utils.logger import setup_logger +from utils.s3 import read_dataframe_from_s3_parquet logger = setup_logger() @@ -55,11 +56,12 @@ async def trigger_plan(body: PlanTriggerRequest): try: session.begin() logger.info("Getting the inputs") - # Read in the trigger file from s3 - bucket_name = get_settings().PLAN_TRIGGER_BUCKET epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN) + plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path) + uprn_filenames = read_dataframe_from_s3_parquet( + bucket_name=get_settings().PLAN_TRIGGER_BUCKET, file_key="spatial/filename_meta.parquet" + ) - plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path) input_properties = [] for config in plan_input: # We validate each record in the file. 
If the record is NOT valid, we need to handle this accordingly @@ -96,7 +98,7 @@ async def trigger_plan(body: PlanTriggerRequest): for p in input_properties: p.search_address_epc() p.set_year_built() - p.get_spatial_data() + p.get_spatial_data(uprn_filenames) # The materials data could be cached or local so we don't need to make # consistent requests to the backend for @@ -110,7 +112,7 @@ async def trigger_plan(body: PlanTriggerRequest): materials_by_type = filter_materials(materials) cleaned = get_cleaned() - logger.info("Getting components and property_change recommendations") + logger.info("Getting components and epc recommendations") # TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers # in as a dependency and then the optimisers can take the input measures in as part of the setup() method diff --git a/etl/property_change/DataProcessor.py b/etl/epc/DataProcessor.py similarity index 89% rename from etl/property_change/DataProcessor.py rename to etl/epc/DataProcessor.py index afa0682d..357faa08 100644 --- a/etl/property_change/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -2,7 +2,7 @@ from pathlib import Path import numpy as np import pandas as pd from BaseUtility import Definitions -from etl.property_change.settings import ( +from etl.epc.settings import ( DATA_PROCESSOR_SETTINGS, EARLIEST_EPC_DATE, FULLY_GLAZED_DESCRIPTIONS, @@ -20,6 +20,40 @@ from etl.property_change.settings import ( from typing import List +# These lookups are used to clean the construction age band +bounds_map = { + "England and Wales: before 1900": {"l": 0, "u": 1899}, + "England and Wales: 1930-1949": {"l": 1930, "u": 1949}, + "England and Wales: 1900-1929": {"l": 1900, "u": 1929}, + "England and Wales: 1950-1966": {"l": 1950, "u": 1966}, + "England and Wales: 1967-1975": {"l": 1967, "u": 1975}, + "England and Wales: 1976-1982": {"l": 1976, "u": 1982}, + "England and Wales: 1983-1990": {"l": 1983, "u": 1990}, + "England and Wales: 
1991-1995": {"l": 1991, "u": 1995}, + "England and Wales: 1996-2002": {"l": 1996, "u": 2002}, + "England and Wales: 2003-2006": {"l": 2003, "u": 2006}, + "England and Wales: 2007-2011": {"l": 2007, "u": 2011}, + "England and Wales: 2012 onwards": {"l": 2012, "u": 3000}, +} + +remap = { + "England and Wales: 2007 onwards": "England and Wales: 2007-2011" +} + +expanded_map = { + i: [ + label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l']) + ][0] for i in range(0, 3001) +} + + +def is_int(x): + try: + int(x) + return True + except: + return False + class DataProcessor: """ @@ -45,66 +79,36 @@ class DataProcessor: def insert_data(self, data: pd.DataFrame) -> None: self.data = data + @staticmethod + def clean_construction_age_band(x): + # Firstly, we check if it's an error value + if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]: + return x + + # Next, we check if it's a value in our map + if bounds_map.get(x): + return x + + # We check if it's a standard remap value + remap_value = remap.get(x, None) + if remap_value: + return remap_value + + # We check if it's a number + if is_int(x): + x_int = int(x) + return expanded_map[x_int] + + raise NotImplementedError("Not handled the case for value %s" % x) + def standardise_construction_age_band(self): """ This function will tidy up some of the non-standard values that are populated in the construction age band, which is useful for cleaning """ - bounds_map = { - "England and Wales: before 1900": {"l": 0, "u": 1899}, - "England and Wales: 1930-1949": {"l": 1930, "u": 1949}, - "England and Wales: 1900-1929": {"l": 1900, "u": 1929}, - "England and Wales: 1950-1966": {"l": 1950, "u": 1966}, - "England and Wales: 1967-1975": {"l": 1967, "u": 1975}, - "England and Wales: 1976-1982": {"l": 1976, "u": 1982}, - "England and Wales: 1983-1990": {"l": 1983, "u": 1990}, - "England and Wales: 1991-1995": {"l": 1991, "u": 1995}, - "England and Wales: 1996-2002": {"l": 1996, "u": 2002}, - 
"England and Wales: 2003-2006": {"l": 2003, "u": 2006}, - "England and Wales: 2007-2011": {"l": 2007, "u": 2011}, - "England and Wales: 2012 onwards": {"l": 2012, "u": 3000}, - } - - remap = { - "England and Wales: 2007 onwards": "England and Wales: 2007-2011" - } - - expanded_map = { - i: [ - label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l']) - ][0] for i in range(0, 3001) - } - - def is_int(x): - try: - int(x) - return True - except: - return False - - def clean_construction_age_band(x): - # Firstly, we check if it's an error value - if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]: - return x - - # Next, we check if it's a value in our map - if bounds_map.get(x): - return x - - # We check if it's a standard remap value - remap_value = remap.get(x, None) - if remap_value: - return remap_value - - # We check if it's a number - if is_int(x): - x_int = int(x) - return expanded_map[x_int] - - raise NotImplementedError("Not handled the case for value %s" % x) self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply( - lambda x: clean_construction_age_band(x) + lambda x: self.clean_construction_age_band(x) ) self.data = self.data[ @@ -347,7 +351,7 @@ class DataProcessor: cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE") - # If there still is na values, use average across all property_change in consituecy + # If there still is na values, use average across all epc in consituecy cleaning_averages_filled[variable] = cleaning_averages_filled[ variable ].fillna(cleaning_averages_filled[variable].mean()) diff --git a/etl/property_change/FeatureProcessor.py b/etl/epc/FeatureProcessor.py similarity index 100% rename from etl/property_change/FeatureProcessor.py rename to etl/epc/FeatureProcessor.py diff --git a/etl/property_change/__init__.py b/etl/epc/__init__.py similarity index 100% rename from etl/property_change/__init__.py rename to 
etl/epc/__init__.py diff --git a/etl/property_change/app.py b/etl/epc/property_change_app.py similarity index 99% rename from etl/property_change/app.py rename to etl/epc/property_change_app.py index 605c9a93..8b5a5088 100644 --- a/etl/property_change/app.py +++ b/etl/epc/property_change_app.py @@ -4,7 +4,7 @@ from tqdm import tqdm import msgpack from pathlib import Path -from etl.property_change.settings import ( +from etl.epc.settings import ( MANDATORY_FIXED_FEATURES, LATEST_FIELD, COMPONENT_FEATURES, @@ -14,7 +14,7 @@ from etl.property_change.settings import ( EARLIEST_EPC_DATE, CARBON_RESPONSE, ) -from etl.property_change.DataProcessor import DataProcessor +from etl.epc.DataProcessor import DataProcessor from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3 from recommendations.rdsap_tables import england_wales_age_band_lookup from recommendations.recommendation_utils import ( diff --git a/etl/property_change/requirements.txt b/etl/epc/requirements.txt similarity index 100% rename from etl/property_change/requirements.txt rename to etl/epc/requirements.txt diff --git a/etl/property_change/settings.py b/etl/epc/settings.py similarity index 100% rename from etl/property_change/settings.py rename to etl/epc/settings.py diff --git a/etl/epc_clean/app.py b/etl/epc_clean/app.py index a7ea7875..d23e3f84 100644 --- a/etl/epc_clean/app.py +++ b/etl/epc_clean/app.py @@ -4,7 +4,7 @@ import pandas as pd import msgpack from etl.epc_clean.EpcClean import EpcClean -from etl.property_change.settings import EARLIEST_EPC_DATE +from etl.epc.settings import EARLIEST_EPC_DATE from pathlib import Path from utils.s3 import save_data_to_s3 @@ -27,7 +27,7 @@ ENVIRONMENT = os.getenv("ENVIRONMENT", "dev") def app(): """ For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API - and produce a dataset of cleaned fields so that when we get new property_change, we can quickly + and produce a dataset of cleaned fields so that when we 
get new properties, we can quickly sanitise any description data Currently, this application is just run on a local machine diff --git a/etl/spatial/BoreholeClient.py b/etl/spatial/BoreholeClient.py index cd7df667..24399775 100644 --- a/etl/spatial/BoreholeClient.py +++ b/etl/spatial/BoreholeClient.py @@ -56,7 +56,7 @@ class BoreholeClient: # EXAMPLE # There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of - # entries in here if possible before we produce any form of comparison between our property_change, to infer + # entries in here if possible before we produce any form of comparison between our properties, to infer # the distance from the property to the nearest borehole # Let's take a sample diff --git a/etl/wall_area/app.py b/etl/wall_area/app.py index 9eef73f8..41b1b159 100644 --- a/etl/wall_area/app.py +++ b/etl/wall_area/app.py @@ -1,5 +1,5 @@ """ -This script produces the dataset used to model the wall area of property_change, which is used to estimate the cost +This script produces the dataset used to model the wall area of properties, which is used to estimate the cost of insulation measures within homes """ import os diff --git a/infrastructure/terraform/main.tf b/infrastructure/terraform/main.tf index 5f4fdba1..79fe015e 100644 --- a/infrastructure/terraform/main.tf +++ b/infrastructure/terraform/main.tf @@ -83,7 +83,7 @@ resource "aws_db_instance" "default" { publicly_accessible = true } -# Set up the bucket that recieve the csv uploads of property_change to be retrofit +# Set up the bucket that receives the csv uploads of properties to be retrofit module "s3_presignable_bucket" { source = "./modules/s3_presignable_bucket" bucketname = "retrofit-plan-inputs-${var.stage}" diff --git a/input_property_list.csv b/input_property_list.csv index 097a6b23..dc677c88 100644 --- a/input_property_list.csv +++ b/input_property_list.csv @@ -7,6 +7,6 @@ Flat 3 Frederick Building,N1 4BD,,,,, Flat 4 Frederick Building,N1 4BD,,,,, "Flat 28, 22 Adelina
Grove",E1 3BX,,,,, "Flat 39, 239 Long Lane",SE1 4PT,,,,, -"1, Westview, Someday",LE14 2QH,This property has an unfilled cavity,,,, +"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,, "59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,, 88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,, \ No newline at end of file diff --git a/recommendations/WallRecommendations.py b/recommendations/WallRecommendations.py index 942dc8fa..a6f100be 100644 --- a/recommendations/WallRecommendations.py +++ b/recommendations/WallRecommendations.py @@ -91,8 +91,8 @@ class WallRecommendations(Definitions): if self.property.walls["thermal_transmittance_unit"] != self.U_VALUE_UNIT: raise NotImplementedError("Haven't handled the case of other u value units yet") - # TODO: It's worth thinking about this logic because depending on when property_change were built, - # they're likely to be of a certain standard. E.g. property_change built within a certain time + # TODO: It's worth thinking about this logic because depending on when properties were built, + # they're likely to be of a certain standard. E.g.
properties built within a certain time # period are likely to have cavity walls # We can't detect it's a cavity wall, but it was built after 1990 so likely built with insulation already diff --git a/recommendations/tests/test_wall_recommendations.py b/recommendations/tests/test_wall_recommendations.py index f58ec02e..0e26d9a9 100644 --- a/recommendations/tests/test_wall_recommendations.py +++ b/recommendations/tests/test_wall_recommendations.py @@ -230,7 +230,7 @@ class TestWallRecommendations: The important data for this recommendation is: - u value of 0.16 - property built in 2014 - Since property_change built after 1990 are typically built with insulation and this property + Since properties built after 1990 are typically built with insulation and this property already has really good insulation, we do NOT recommend any measures for this property """ input_properties[0].year_built = 2014