implemented property age band cleaning

This commit is contained in:
Khalim Conn-Kowlessar 2023-10-05 18:20:52 +01:00
parent 6cc84e95bf
commit 2b783c8d1a
15 changed files with 92 additions and 85 deletions

View file

@ -3,6 +3,7 @@ import re
import os
import pandas as pd
from etl.epc.DataProcessor import DataProcessor
from utils.logger import setup_logger
from utils.s3 import read_dataframe_from_s3_parquet
from epc_api.client import EpcClient
@ -50,6 +51,7 @@ class Property(Definitions):
self.uprn = None
self.full_sap_epc = None
self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
self.restricted_measures = False
self.year_built = None
self.number_of_rooms = None
@ -139,7 +141,7 @@ class Property(Definitions):
"""
ventilation = self.data["mechanical-ventilation"]
# perform some simple cleaning - when checking 300k property_change, the only unique values were
# perform some simple cleaning - when checking 300k epc, the only unique values were
# {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'}
if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]:
ventilation = None
@ -157,7 +159,7 @@ class Property(Definitions):
- solar_pv
This is based on the "photo-supply" field in the EPC data.
When checking 100k property_change, either the value was "" or a stringified number
When checking 100k epc, either the value was "" or a stringified number
"""
solar_pv = self.data["photo-supply"]
@ -287,7 +289,8 @@ class Property(Definitions):
if not self.data:
raise ValueError("Property does not contain data")
self.age_band = england_wales_age_band_lookup[self.data["construction-age-band"]]
construction_age_band = DataProcessor.clean_construction_age_band(self.data["construction-age-band"])
self.age_band = england_wales_age_band_lookup.get(construction_age_band)
def set_spatial(self, spatial: pd.DataFrame):
"""
@ -295,8 +298,11 @@ class Property(Definitions):
:param spatial: Dataframe, containing the spatial data for the property
"""
self.in_conservation_area = spatial["conservation_status"].values[0]
self.is_listed = spatial["is_listed"].values[0]
self.is_heritage = spatial["is_heritage"].values[0]
self.is_listed = spatial["is_listed_building"].values[0]
self.is_heritage = spatial["is_heritage_building"].values[0]
if self.in_conservation_area | self.is_listed | self.is_heritage:
self.restricted_measures = True
def set_year_built(self):
"""
@ -476,7 +482,7 @@ class Property(Definitions):
self.floor_area = float(self.data["total-floor-area"])
def get_spatial_data(self):
def get_spatial_data(self, uprn_filenames):
"""
Given a property's UPRN, this method will pull the associated spatial data from s3
@ -486,13 +492,8 @@ class Property(Definitions):
if self.uprn is None:
raise ValueError("URPN is not set, run search_address_epc")
# We get the filenames
filenames = read_dataframe_from_s3_parquet(
bucket_name=DATA_BUCKET, file_key="spatial/filename_meta.parquet"
)
# We get the file name for the uprn
filtered_df = filenames[(filenames['lower'] <= self.uprn) & (filenames['upper'] >= self.uprn)]
filtered_df = uprn_filenames[(uprn_filenames['lower'] <= self.uprn) & (uprn_filenames['upper'] >= self.uprn)]
if filtered_df.empty:
logger.warning("Could not find file containing UPRNS")
return None

View file

@ -27,14 +27,15 @@ from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_par
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
from backend.Property import Property
from etl.property_change.DataProcessor import DataProcessor
from etl.property_change.settings import COLUMNS_TO_MERGE_ON
from etl.epc.DataProcessor import DataProcessor
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from recommendations.FloorRecommendations import FloorRecommendations
from recommendations.optimiser.CostOptimiser import CostOptimiser
from recommendations.optimiser.GainOptimiser import GainOptimiser
from recommendations.optimiser.optimiser_functions import prepare_input_measures
from recommendations.WallRecommendations import WallRecommendations
from utils.logger import setup_logger
from utils.s3 import read_dataframe_from_s3_parquet
logger = setup_logger()
@ -55,11 +56,12 @@ async def trigger_plan(body: PlanTriggerRequest):
try:
session.begin()
logger.info("Getting the inputs")
# Read in the trigger file from s3
bucket_name = get_settings().PLAN_TRIGGER_BUCKET
epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path)
uprn_filenames = read_dataframe_from_s3_parquet(
bucket_name=get_settings().PLAN_TRIGGER_BUCKET, file_key="spatial/filename_meta.parquet"
)
plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
input_properties = []
for config in plan_input:
# We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
@ -96,7 +98,7 @@ async def trigger_plan(body: PlanTriggerRequest):
for p in input_properties:
p.search_address_epc()
p.set_year_built()
p.get_spatial_data()
p.get_spatial_data(uprn_filenames)
# The materials data could be cached or local so we don't need to make
# consistent requests to the backend for
@ -110,7 +112,7 @@ async def trigger_plan(body: PlanTriggerRequest):
materials_by_type = filter_materials(materials)
cleaned = get_cleaned()
logger.info("Getting components and property_change recommendations")
logger.info("Getting components and epc recommendations")
# TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers
# in as a dependency and then the optimisers can take the input measures in as part of the setup() method

View file

@ -2,7 +2,7 @@ from pathlib import Path
import numpy as np
import pandas as pd
from BaseUtility import Definitions
from etl.property_change.settings import (
from etl.epc.settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
FULLY_GLAZED_DESCRIPTIONS,
@ -20,6 +20,40 @@ from etl.property_change.settings import (
from typing import List
# Lookup tables used when cleaning the construction age band.

# Inclusive lower ("l") and upper ("u") build-year bounds for every standard age band label
bounds_map = {
    "England and Wales: before 1900": dict(l=0, u=1899),
    "England and Wales: 1930-1949": dict(l=1930, u=1949),
    "England and Wales: 1900-1929": dict(l=1900, u=1929),
    "England and Wales: 1950-1966": dict(l=1950, u=1966),
    "England and Wales: 1967-1975": dict(l=1967, u=1975),
    "England and Wales: 1976-1982": dict(l=1976, u=1982),
    "England and Wales: 1983-1990": dict(l=1983, u=1990),
    "England and Wales: 1991-1995": dict(l=1991, u=1995),
    "England and Wales: 1996-2002": dict(l=1996, u=2002),
    "England and Wales: 2003-2006": dict(l=2003, u=2006),
    "England and Wales: 2007-2011": dict(l=2007, u=2011),
    "England and Wales: 2012 onwards": dict(l=2012, u=3000),
}

# Known non-standard labels and the standard label each one is replaced with
remap = {"England and Wales: 2007 onwards": "England and Wales: 2007-2011"}

# Every year from 0 to 3000 (inclusive) mapped to the first band whose bounds contain it
expanded_map = {
    year: next(
        band
        for band, limits in bounds_map.items()
        if limits["l"] <= year <= limits["u"]
    )
    for year in range(3001)
}
def is_int(x):
    """
    Report whether a value can be converted with ``int()``.

    :param x: Any value, e.g. a stringified year such as "1995"
    :return: True if ``int(x)`` succeeds, False otherwise
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: non-numeric string or NaN;
        # OverflowError: infinity. Anything else (e.g. KeyboardInterrupt) is left to propagate
        # rather than being silently reported as "not an int".
        return False
    return True
class DataProcessor:
"""
@ -45,66 +79,36 @@ class DataProcessor:
def insert_data(self, data: pd.DataFrame) -> None:
self.data = data
@staticmethod
def clean_construction_age_band(x):
    """
    Standardise a single construction age band value.

    :param x: Raw construction age band: a standard label, a known non-standard label, or a
        (stringified) build year
    :return: ``x`` unchanged when it is a data anomaly, missing, or already a standard label; otherwise
        the standard label it maps to
    :raises KeyError: If ``x`` is an integer outside the 0-3000 range covered by ``expanded_map``
    :raises NotImplementedError: If the value is not one of the handled cases
    """
    # Firstly, we check if it's an error value or missing. NaN is tested explicitly because NaN != NaN:
    # a membership test against [None, np.nan] only matches the np.nan object itself, so any other
    # NaN float would otherwise fall through to the NotImplementedError below
    is_missing = x is None or (isinstance(x, (float, np.floating)) and np.isnan(x))
    if is_missing or x in Definitions.DATA_ANOMALY_MATCHES:
        return x

    # Next, we check if it's already one of the standard labels
    if x in bounds_map:
        return x

    # We check if it's a known non-standard label with a standard replacement
    remap_value = remap.get(x)
    if remap_value:
        return remap_value

    # We check if it's a number, in which case it's treated as a year and mapped onto its band
    if is_int(x):
        return expanded_map[int(x)]

    raise NotImplementedError("Not handled the case for value %s" % x)
def standardise_construction_age_band(self):
"""
This function will tidy up some of the non-standard values that are populated in the construction age
band, which is useful for cleaning
"""
bounds_map = {
"England and Wales: before 1900": {"l": 0, "u": 1899},
"England and Wales: 1930-1949": {"l": 1930, "u": 1949},
"England and Wales: 1900-1929": {"l": 1900, "u": 1929},
"England and Wales: 1950-1966": {"l": 1950, "u": 1966},
"England and Wales: 1967-1975": {"l": 1967, "u": 1975},
"England and Wales: 1976-1982": {"l": 1976, "u": 1982},
"England and Wales: 1983-1990": {"l": 1983, "u": 1990},
"England and Wales: 1991-1995": {"l": 1991, "u": 1995},
"England and Wales: 1996-2002": {"l": 1996, "u": 2002},
"England and Wales: 2003-2006": {"l": 2003, "u": 2006},
"England and Wales: 2007-2011": {"l": 2007, "u": 2011},
"England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
}
remap = {
"England and Wales: 2007 onwards": "England and Wales: 2007-2011"
}
expanded_map = {
i: [
label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
][0] for i in range(0, 3001)
}
def is_int(x):
try:
int(x)
return True
except:
return False
def clean_construction_age_band(x):
# Firstly, we check if it's an error value
if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]:
return x
# Next, we check if it's a value in our map
if bounds_map.get(x):
return x
# We check if it's a standard remap value
remap_value = remap.get(x, None)
if remap_value:
return remap_value
# We check if it's a number
if is_int(x):
x_int = int(x)
return expanded_map[x_int]
raise NotImplementedError("Not handled the case for value %s" % x)
self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
lambda x: clean_construction_age_band(x)
lambda x: self.clean_construction_age_band(x)
)
self.data = self.data[
@ -347,7 +351,7 @@ class DataProcessor:
cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")
# If there still is na values, use average across all property_change in consituecy
# If there are still NA values, use the average across all EPCs in the constituency
cleaning_averages_filled[variable] = cleaning_averages_filled[
variable
].fillna(cleaning_averages_filled[variable].mean())

View file

@ -4,7 +4,7 @@ from tqdm import tqdm
import msgpack
from pathlib import Path
from etl.property_change.settings import (
from etl.epc.settings import (
MANDATORY_FIXED_FEATURES,
LATEST_FIELD,
COMPONENT_FEATURES,
@ -14,7 +14,7 @@ from etl.property_change.settings import (
EARLIEST_EPC_DATE,
CARBON_RESPONSE,
)
from etl.property_change.DataProcessor import DataProcessor
from etl.epc.DataProcessor import DataProcessor
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
from recommendations.rdsap_tables import england_wales_age_band_lookup
from recommendations.recommendation_utils import (

View file

@ -4,7 +4,7 @@ import pandas as pd
import msgpack
from etl.epc_clean.EpcClean import EpcClean
from etl.property_change.settings import EARLIEST_EPC_DATE
from etl.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
from utils.s3 import save_data_to_s3
@ -27,7 +27,7 @@ ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
def app():
"""
For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
and produce a dataset of cleaned fields so that when we get new property_change, we can quickly
and produce a dataset of cleaned fields so that when we get new epc, we can quickly
sanitise any description data
Currently, this application is just run on a local machine

View file

@ -56,7 +56,7 @@ class BoreholeClient:
# EXAMPLE
# There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of
# entries in here if possible before we produce any form of comparison between our property_change, to infer
# entries in here if possible before we produce any form of comparison between our epc, to infer
# the distance from the property to the nearest borehole
# Let's take a sample

View file

@ -1,5 +1,5 @@
"""
This script produces the dataset used to model the wall area of property_change, which is used to estimate the cost
This script produces the dataset used to model the wall area of epc, which is used to estimate the cost
of insulation measures within homes
"""
import os

View file

@ -83,7 +83,7 @@ resource "aws_db_instance" "default" {
publicly_accessible = true
}
# Set up the bucket that recieve the csv uploads of property_change to be retrofit
# Set up the bucket that receives the CSV uploads of properties to be retrofitted
module "s3_presignable_bucket" {
source = "./modules/s3_presignable_bucket"
bucketname = "retrofit-plan-inputs-${var.stage}"

View file

@ -7,6 +7,6 @@ Flat 3 Frederick Building,N1 4BD,,,,,
Flat 4 Frederick Building,N1 4BD,,,,,
"Flat 28, 22 Adelina Grove",E1 3BX,,,,,
"Flat 39, 239 Long Lane",SE1 4PT,,,,,
"1, Westview, Someday",LE14 2QH,This property has an unfilled cavity,,,,
"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,,
"59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,,
88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,,
1 address postcode Notes
7 Flat 4 Frederick Building N1 4BD
8 Flat 28, 22 Adelina Grove E1 3BX
9 Flat 39, 239 Long Lane SE1 4PT
10 1, Westview, Someday 1, Westview, Somerby LE14 2QH This property has an unfilled cavity
11 59, Ashdale CM23 4EB This property has a partially filled cavity
12 88 Cleveland Avenue DL3 7BE This property has a filled cavity

View file

@ -91,8 +91,8 @@ class WallRecommendations(Definitions):
if self.property.walls["thermal_transmittance_unit"] != self.U_VALUE_UNIT:
raise NotImplementedError("Haven't handled the case of other u value units yet")
# TODO: It's worth thinking about this logic because depending on when property_change were built,
# they're likely to be of a certain standard. E.g. property_change built within a certain time
# TODO: It's worth thinking about this logic because depending on when properties were built,
# they're likely to be of a certain standard. E.g. properties built within a certain time
# period are likely to have cavity walls
# We can't detect it's a cavity wall, but it was built after 1990 so likely built with insulation already

View file

@ -230,7 +230,7 @@ class TestWallRecommendations:
The important data for this recommendation is:
- u value of 0.16
- property built in 2014
Since property_change built after 1990 are typically built with insulation and this property
Since epc built after 1990 are typically built with insulation and this property
already has really good insulation, we do NOT recommend any measures for this property
"""
input_properties[0].year_built = 2014