implemented property age band cleaning

2026-07-27 23:35:01 +00:00 · 2023-10-05 18:20:52 +01:00 · 2023-10-05 18:20:52 +01:00 · 2b783c8d1a
commit 2b783c8d1a
parent 6cc84e95bf
15 changed files with 92 additions and 85 deletions
--- a/backend/Property.py
+++ b/backend/Property.py
@ -3,6 +3,7 @@ import re
 import os
 import pandas as pd

+from etl.epc.DataProcessor import DataProcessor
 from utils.logger import setup_logger
 from utils.s3 import read_dataframe_from_s3_parquet
 from epc_api.client import EpcClient
@ -50,6 +51,7 @@ class Property(Definitions):
        self.uprn = None
        self.full_sap_epc = None
        self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
+        self.restricted_measures = False
        self.year_built = None
        self.number_of_rooms = None

@ -139,7 +141,7 @@ class Property(Definitions):
        """

        ventilation = self.data["mechanical-ventilation"]
-        # perform some simple cleaning - when checking 300k property_change, the only unique values were
+        # perform some simple cleaning - when checking 300k properties, the only unique values were
        # {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'}
        if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]:
            ventilation = None
@ -157,7 +159,7 @@ class Property(Definitions):
        - solar_pv
        This is based on the "photo-supply" field in the EPC data.

-        When checking 100k property_change, either the value was "" or a stringified number
+        When checking 100k properties, either the value was "" or a stringified number
        """

        solar_pv = self.data["photo-supply"]
@ -287,7 +289,8 @@ class Property(Definitions):
        if not self.data:
            raise ValueError("Property does not contain data")

-        self.age_band = england_wales_age_band_lookup[self.data["construction-age-band"]]
+        construction_age_band = DataProcessor.clean_construction_age_band(self.data["construction-age-band"])
+        self.age_band = england_wales_age_band_lookup.get(construction_age_band)

    def set_spatial(self, spatial: pd.DataFrame):
        """
@ -295,8 +298,11 @@ class Property(Definitions):
        :param spatial:  Dataframe, containing the spatial data for the property
        """
        self.in_conservation_area = spatial["conservation_status"].values[0]
-        self.is_listed = spatial["is_listed"].values[0]
-        self.is_heritage = spatial["is_heritage"].values[0]
+        self.is_listed = spatial["is_listed_building"].values[0]
+        self.is_heritage = spatial["is_heritage_building"].values[0]
+
+        if self.in_conservation_area | self.is_listed | self.is_heritage:
+            self.restricted_measures = True

    def set_year_built(self):
        """
@ -476,7 +482,7 @@ class Property(Definitions):

        self.floor_area = float(self.data["total-floor-area"])

-    def get_spatial_data(self):
+    def get_spatial_data(self, uprn_filenames):

        """
        Given a property's UPRN, this method will pull the associated spatial data from s3
@ -486,13 +492,8 @@ class Property(Definitions):
        if self.uprn is None:
            raise ValueError("URPN is not set, run search_address_epc")

-        # We get the filenames
-        filenames = read_dataframe_from_s3_parquet(
-            bucket_name=DATA_BUCKET, file_key="spatial/filename_meta.parquet"
-        )
-
        # We get the file name for the uprn
-        filtered_df = filenames[(filenames['lower'] <= self.uprn) & (filenames['upper'] >= self.uprn)]
+        filtered_df = uprn_filenames[(uprn_filenames['lower'] <= self.uprn) & (uprn_filenames['upper'] >= self.uprn)]
        if filtered_df.empty:
            logger.warning("Could not find file containing UPRNS")
            return None
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@ -27,14 +27,15 @@ from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_par

 from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
 from backend.Property import Property
-from etl.property_change.DataProcessor import DataProcessor
-from etl.property_change.settings import COLUMNS_TO_MERGE_ON
+from etl.epc.DataProcessor import DataProcessor
+from etl.epc.settings import COLUMNS_TO_MERGE_ON
 from recommendations.FloorRecommendations import FloorRecommendations
 from recommendations.optimiser.CostOptimiser import CostOptimiser
 from recommendations.optimiser.GainOptimiser import GainOptimiser
 from recommendations.optimiser.optimiser_functions import prepare_input_measures
 from recommendations.WallRecommendations import WallRecommendations
 from utils.logger import setup_logger
+from utils.s3 import read_dataframe_from_s3_parquet

 logger = setup_logger()

@ -55,11 +56,12 @@ async def trigger_plan(body: PlanTriggerRequest):
    try:
        session.begin()
        logger.info("Getting the inputs")
-        # Read in the trigger file from s3
-        bucket_name = get_settings().PLAN_TRIGGER_BUCKET
        epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
+        plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path)
+        uprn_filenames = read_dataframe_from_s3_parquet(
+            bucket_name=get_settings().PLAN_TRIGGER_BUCKET, file_key="spatial/filename_meta.parquet"
+        )

-        plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
        input_properties = []
        for config in plan_input:
            # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
@ -96,7 +98,7 @@ async def trigger_plan(body: PlanTriggerRequest):
        for p in input_properties:
            p.search_address_epc()
            p.set_year_built()
-            p.get_spatial_data()
+            p.get_spatial_data(uprn_filenames)

        # The materials data could be cached or local so we don't need to make
        # consistent requests to the backend for
@ -110,7 +112,7 @@ async def trigger_plan(body: PlanTriggerRequest):
        materials_by_type = filter_materials(materials)
        cleaned = get_cleaned()

-        logger.info("Getting components and property_change recommendations")
+        logger.info("Getting components and epc recommendations")

        # TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers
        #      in as a dependency and then the optimisers can take the input measures in as part of the setup() method
--- a/etl/property_change/DataProcessor.py
+++ b/etl/property_change/DataProcessor.py
@ -2,7 +2,7 @@ from pathlib import Path
 import numpy as np
 import pandas as pd
 from BaseUtility import Definitions
-from etl.property_change.settings import (
+from etl.epc.settings import (
    DATA_PROCESSOR_SETTINGS,
    EARLIEST_EPC_DATE,
    FULLY_GLAZED_DESCRIPTIONS,
@ -20,6 +20,40 @@ from etl.property_change.settings import (

 from typing import List

+# Lookup tables used when cleaning the construction age band.
+#
+# bounds_map: standard band label -> inclusive lower ("l") and upper ("u") year of that band
+bounds_map = {
+    "England and Wales: before 1900": dict(l=0, u=1899),
+    "England and Wales: 1930-1949": dict(l=1930, u=1949),
+    "England and Wales: 1900-1929": dict(l=1900, u=1929),
+    "England and Wales: 1950-1966": dict(l=1950, u=1966),
+    "England and Wales: 1967-1975": dict(l=1967, u=1975),
+    "England and Wales: 1976-1982": dict(l=1976, u=1982),
+    "England and Wales: 1983-1990": dict(l=1983, u=1990),
+    "England and Wales: 1991-1995": dict(l=1991, u=1995),
+    "England and Wales: 1996-2002": dict(l=1996, u=2002),
+    "England and Wales: 2003-2006": dict(l=2003, u=2006),
+    "England and Wales: 2007-2011": dict(l=2007, u=2011),
+    "England and Wales: 2012 onwards": dict(l=2012, u=3000),
+}
+
+# remap: non-standard label -> the standard label it is replaced with
+remap = {"England and Wales: 2007 onwards": "England and Wales: 2007-2011"}
+
+# expanded_map: every year from 0 to 3000 inclusive -> the first band whose bounds contain that year
+expanded_map = {
+    year: next(
+        band for band, limits in bounds_map.items() if limits["l"] <= year <= limits["u"]
+    )
+    for year in range(3001)
+}
+
+
+def is_int(x):
+    """
+    Report whether ``int(x)`` succeeds for the given value.
+
+    :param x: Any value
+    :return: True if ``int(x)`` returns without a conversion error, otherwise False
+    """
+    try:
+        int(x)
+    except (TypeError, ValueError, OverflowError):
+        # Only conversion failures mean "not an int". Anything else (e.g. KeyboardInterrupt)
+        # must propagate instead of being silently turned into False
+        return False
+    return True
+

 class DataProcessor:
    """
@ -45,66 +79,36 @@ class DataProcessor:
    def insert_data(self, data: pd.DataFrame) -> None:
        self.data = data

+    @staticmethod
+    def clean_construction_age_band(x):
+        """
+        Normalise a single construction age band value to one of the labels in bounds_map.
+
+        :param x: Raw construction age band value
+        :return: x unchanged when it is missing (None / NaN), a known data anomaly or already a
+            bounds_map label; the replacement label when x is a key of remap; otherwise the
+            expanded_map label for int(x)
+        :raises NotImplementedError: if x matches none of the handled cases
+        :raises KeyError: if int(x) is not a key of expanded_map
+        """
+        # Missing values are passed through. NaN is detected by value rather than by list membership,
+        # because NaN != NaN and a membership test only matches the np.nan object itself
+        if x is None or (isinstance(x, float) and np.isnan(x)):
+            return x
+
+        # Known anomaly markers are passed through as well
+        if x in Definitions.DATA_ANOMALY_MATCHES:
+            return x
+
+        # Already one of the standard labels
+        if x in bounds_map:
+            return x
+
+        # A non-standard label with a known standard equivalent
+        remap_value = remap.get(x)
+        if remap_value:
+            return remap_value
+
+        # A number (possibly stringified) is mapped to the band that contains it
+        if is_int(x):
+            return expanded_map[int(x)]
+
+        raise NotImplementedError("Not handled the case for value %s" % x)
+
    def standardise_construction_age_band(self):
        """
        This function will tidy up some of the non-standard values that are populated in the construction age
        band, which is useful for cleaning
        """
-        bounds_map = {
-            "England and Wales: before 1900": {"l": 0, "u": 1899},
-            "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
-            "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
-            "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
-            "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
-            "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
-            "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
-            "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
-            "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
-            "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
-            "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
-            "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
-        }
-
-        remap = {
-            "England and Wales: 2007 onwards": "England and Wales: 2007-2011"
-        }
-
-        expanded_map = {
-            i: [
-                label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
-            ][0] for i in range(0, 3001)
-        }
-
-        def is_int(x):
-            try:
-                int(x)
-                return True
-            except:
-                return False
-
-        def clean_construction_age_band(x):
-            # Firstly, we check if it's an error value
-            if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]:
-                return x
-
-            # Next, we check if it's a value in our map
-            if bounds_map.get(x):
-                return x
-
-            # We check if it's a standard remap value
-            remap_value = remap.get(x, None)
-            if remap_value:
-                return remap_value
-
-            # We check if it's a number
-            if is_int(x):
-                x_int = int(x)
-                return expanded_map[x_int]
-
-            raise NotImplementedError("Not handled the case for value %s" % x)

        self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
-            lambda x: clean_construction_age_band(x)
+            lambda x: self.clean_construction_age_band(x)
        )

        self.data = self.data[
@ -347,7 +351,7 @@ class DataProcessor:

            cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")

-            # If there still is na values, use average across all property_change in consituecy
+            # If there are still NA values, use the average across all properties in the constituency
            cleaning_averages_filled[variable] = cleaning_averages_filled[
                variable
            ].fillna(cleaning_averages_filled[variable].mean())
--- a/etl/property_change/FeatureProcessor.py
+++ b/etl/property_change/FeatureProcessor.py
--- a/etl/property_change/init.py
+++ b/etl/property_change/init.py
--- a/etl/epc/property_change_app.py
+++ b/etl/epc/property_change_app.py
@ -4,7 +4,7 @@ from tqdm import tqdm
 import msgpack

 from pathlib import Path
-from etl.property_change.settings import (
+from etl.epc.settings import (
    MANDATORY_FIXED_FEATURES,
    LATEST_FIELD,
    COMPONENT_FEATURES,
@ -14,7 +14,7 @@ from etl.property_change.settings import (
    EARLIEST_EPC_DATE,
    CARBON_RESPONSE,
 )
-from etl.property_change.DataProcessor import DataProcessor
+from etl.epc.DataProcessor import DataProcessor
 from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
 from recommendations.rdsap_tables import england_wales_age_band_lookup
 from recommendations.recommendation_utils import (
--- a/etl/property_change/requirements.txt
+++ b/etl/property_change/requirements.txt
--- a/etl/property_change/settings.py
+++ b/etl/property_change/settings.py
--- a/etl/epc_clean/app.py
+++ b/etl/epc_clean/app.py
@ -4,7 +4,7 @@ import pandas as pd
 import msgpack

 from etl.epc_clean.EpcClean import EpcClean
-from etl.property_change.settings import EARLIEST_EPC_DATE
+from etl.epc.settings import EARLIEST_EPC_DATE
 from pathlib import Path
 from utils.s3 import save_data_to_s3

@ -27,7 +27,7 @@ ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
 def app():
    """
    For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
-    and produce a dataset of cleaned fields so that when we get new property_change, we can quickly
+    and produce a dataset of cleaned fields so that when we get new properties, we can quickly
    sanitise any description data

    Currently, this application is just run on a local machine
--- a/etl/spatial/BoreholeClient.py
+++ b/etl/spatial/BoreholeClient.py
@ -56,7 +56,7 @@ class BoreholeClient:

    # EXAMPLE
    # There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of
-    # entries in here if possible before we produce any form of comparison between our property_change, to infer
+    # entries in here if possible before we produce any form of comparison between our properties, to infer
    # the distance from the property to the nearest borehole

    # Let's take a sample
--- a/etl/wall_area/app.py
+++ b/etl/wall_area/app.py
@ -1,5 +1,5 @@
 """
-This script produces the dataset used to model the wall area of property_change, which is used to estimate the cost
+This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
 of insulation measures within homes
 """
 import os
--- a/infrastructure/terraform/main.tf
+++ b/infrastructure/terraform/main.tf
@ -83,7 +83,7 @@ resource "aws_db_instance" "default" {
  publicly_accessible = true
 }

-# Set up the bucket that recieve the csv uploads of property_change to be retrofit
+# Set up the bucket that receives the CSV uploads of properties to be retrofitted
 module "s3_presignable_bucket" {
  source          = "./modules/s3_presignable_bucket"
  bucketname      = "retrofit-plan-inputs-${var.stage}"
--- a/input_property_list.csv
+++ b/input_property_list.csv
@ -7,6 +7,6 @@ Flat 3 Frederick Building,N1 4BD,,,,,
 Flat 4 Frederick Building,N1 4BD,,,,,
 "Flat 28, 22 Adelina Grove",E1 3BX,,,,,
 "Flat 39, 239 Long Lane",SE1 4PT,,,,,
-"1, Westview, Someday",LE14 2QH,This property has an unfilled cavity,,,,
+"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,,
 "59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,,
 88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,,
--- a/recommendations/WallRecommendations.py
+++ b/recommendations/WallRecommendations.py
@ -91,8 +91,8 @@ class WallRecommendations(Definitions):
            if self.property.walls["thermal_transmittance_unit"] != self.U_VALUE_UNIT:
                raise NotImplementedError("Haven't handled the case of other u value units yet")

-            # TODO: It's worth thinking about this logic because depending on when property_change were built,
-            #       they're likely to be of a certain standard. E.g. property_change built within a certain time
+            # TODO: It's worth thinking about this logic because depending on when properties were built,
+            #       they're likely to be of a certain standard. E.g. properties built within a certain time
            #       period are likely to have cavity walls

            # We can't detect it's a cavity wall, but it was built after 1990 so likely built with insulation already
--- a/recommendations/tests/test_wall_recommendations.py
+++ b/recommendations/tests/test_wall_recommendations.py
@ -230,7 +230,7 @@ class TestWallRecommendations:
        The important data for this recommendation is:
        - u value of 0.16
        - property built in 2014
-        Since property_change built after 1990 are typically built with insulation and this property
+        Since properties built after 1990 are typically built with insulation and this property
        already has really good insulation, we do NOT recommend any measures for this property
        """
        input_properties[0].year_built = 2014