mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
implemented property age band cleaning
This commit is contained in:
parent
6cc84e95bf
commit
2b783c8d1a
15 changed files with 92 additions and 85 deletions
|
|
@ -3,6 +3,7 @@ import re
|
|||
import os
|
||||
import pandas as pd
|
||||
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_dataframe_from_s3_parquet
|
||||
from epc_api.client import EpcClient
|
||||
|
|
@ -50,6 +51,7 @@ class Property(Definitions):
|
|||
self.uprn = None
|
||||
self.full_sap_epc = None
|
||||
self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
|
||||
self.restricted_measures = False
|
||||
self.year_built = None
|
||||
self.number_of_rooms = None
|
||||
|
||||
|
|
@ -139,7 +141,7 @@ class Property(Definitions):
|
|||
"""
|
||||
|
||||
ventilation = self.data["mechanical-ventilation"]
|
||||
# perform some simple cleaning - when checking 300k property_change, the only unique values were
|
||||
# perform some simple cleaning - when checking 300k epc, the only unique values were
|
||||
# {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'}
|
||||
if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]:
|
||||
ventilation = None
|
||||
|
|
@ -157,7 +159,7 @@ class Property(Definitions):
|
|||
- solar_pv
|
||||
This is based on the "photo-supply" field in the EPC data.
|
||||
|
||||
When checking 100k property_change, either the value was "" or a stringified number
|
||||
When checking 100k epc, either the value was "" or a stringified number
|
||||
"""
|
||||
|
||||
solar_pv = self.data["photo-supply"]
|
||||
|
|
@ -287,7 +289,8 @@ class Property(Definitions):
|
|||
if not self.data:
|
||||
raise ValueError("Property does not contain data")
|
||||
|
||||
self.age_band = england_wales_age_band_lookup[self.data["construction-age-band"]]
|
||||
construction_age_band = DataProcessor.clean_construction_age_band(self.data["construction-age-band"])
|
||||
self.age_band = england_wales_age_band_lookup.get(construction_age_band)
|
||||
|
||||
def set_spatial(self, spatial: pd.DataFrame):
|
||||
"""
|
||||
|
|
@ -295,8 +298,11 @@ class Property(Definitions):
|
|||
:param spatial: Dataframe, containing the spatial data for the property
|
||||
"""
|
||||
self.in_conservation_area = spatial["conservation_status"].values[0]
|
||||
self.is_listed = spatial["is_listed"].values[0]
|
||||
self.is_heritage = spatial["is_heritage"].values[0]
|
||||
self.is_listed = spatial["is_listed_building"].values[0]
|
||||
self.is_heritage = spatial["is_heritage_building"].values[0]
|
||||
|
||||
if self.in_conservation_area | self.is_listed | self.is_heritage:
|
||||
self.restricted_measures = True
|
||||
|
||||
def set_year_built(self):
|
||||
"""
|
||||
|
|
@ -476,7 +482,7 @@ class Property(Definitions):
|
|||
|
||||
self.floor_area = float(self.data["total-floor-area"])
|
||||
|
||||
def get_spatial_data(self):
|
||||
def get_spatial_data(self, uprn_filenames):
|
||||
|
||||
"""
|
||||
Given a property's UPRN, this method will pull the associated spatial data from s3
|
||||
|
|
@ -486,13 +492,8 @@ class Property(Definitions):
|
|||
if self.uprn is None:
|
||||
raise ValueError("URPN is not set, run search_address_epc")
|
||||
|
||||
# We get the filenames
|
||||
filenames = read_dataframe_from_s3_parquet(
|
||||
bucket_name=DATA_BUCKET, file_key="spatial/filename_meta.parquet"
|
||||
)
|
||||
|
||||
# We get the file name for the uprn
|
||||
filtered_df = filenames[(filenames['lower'] <= self.uprn) & (filenames['upper'] >= self.uprn)]
|
||||
filtered_df = uprn_filenames[(uprn_filenames['lower'] <= self.uprn) & (uprn_filenames['upper'] >= self.uprn)]
|
||||
if filtered_df.empty:
|
||||
logger.warning("Could not find file containing UPRNS")
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -27,14 +27,15 @@ from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_par
|
|||
|
||||
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
|
||||
from backend.Property import Property
|
||||
from etl.property_change.DataProcessor import DataProcessor
|
||||
from etl.property_change.settings import COLUMNS_TO_MERGE_ON
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from etl.epc.settings import COLUMNS_TO_MERGE_ON
|
||||
from recommendations.FloorRecommendations import FloorRecommendations
|
||||
from recommendations.optimiser.CostOptimiser import CostOptimiser
|
||||
from recommendations.optimiser.GainOptimiser import GainOptimiser
|
||||
from recommendations.optimiser.optimiser_functions import prepare_input_measures
|
||||
from recommendations.WallRecommendations import WallRecommendations
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_dataframe_from_s3_parquet
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
|
@ -55,11 +56,12 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
try:
|
||||
session.begin()
|
||||
logger.info("Getting the inputs")
|
||||
# Read in the trigger file from s3
|
||||
bucket_name = get_settings().PLAN_TRIGGER_BUCKET
|
||||
epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
|
||||
plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path)
|
||||
uprn_filenames = read_dataframe_from_s3_parquet(
|
||||
bucket_name=get_settings().PLAN_TRIGGER_BUCKET, file_key="spatial/filename_meta.parquet"
|
||||
)
|
||||
|
||||
plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
|
||||
input_properties = []
|
||||
for config in plan_input:
|
||||
# We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
|
||||
|
|
@ -96,7 +98,7 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
for p in input_properties:
|
||||
p.search_address_epc()
|
||||
p.set_year_built()
|
||||
p.get_spatial_data()
|
||||
p.get_spatial_data(uprn_filenames)
|
||||
|
||||
# The materials data could be cached or local so we don't need to make
|
||||
# consistent requests to the backend for
|
||||
|
|
@ -110,7 +112,7 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
materials_by_type = filter_materials(materials)
|
||||
cleaned = get_cleaned()
|
||||
|
||||
logger.info("Getting components and property_change recommendations")
|
||||
logger.info("Getting components and epc recommendations")
|
||||
|
||||
# TODO: Move this to a class. We probably want a Recommender class which takes the optimisers
|
||||
# in as a dependency and then the optimisers can take the input measures in as part of the setup() method
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ from pathlib import Path
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
from BaseUtility import Definitions
|
||||
from etl.property_change.settings import (
|
||||
from etl.epc.settings import (
|
||||
DATA_PROCESSOR_SETTINGS,
|
||||
EARLIEST_EPC_DATE,
|
||||
FULLY_GLAZED_DESCRIPTIONS,
|
||||
|
|
@ -20,6 +20,40 @@ from etl.property_change.settings import (
|
|||
|
||||
from typing import List
|
||||
|
||||
# Lookup tables used when cleaning the construction age band.
# Each standard band label maps to its inclusive lower ("l") and upper ("u") year.
bounds_map = {
    band: {"l": lower, "u": upper}
    for band, lower, upper in (
        ("England and Wales: before 1900", 0, 1899),
        ("England and Wales: 1930-1949", 1930, 1949),
        ("England and Wales: 1900-1929", 1900, 1929),
        ("England and Wales: 1950-1966", 1950, 1966),
        ("England and Wales: 1967-1975", 1967, 1975),
        ("England and Wales: 1976-1982", 1976, 1982),
        ("England and Wales: 1983-1990", 1983, 1990),
        ("England and Wales: 1991-1995", 1991, 1995),
        ("England and Wales: 1996-2002", 1996, 2002),
        ("England and Wales: 2003-2006", 2003, 2006),
        ("England and Wales: 2007-2011", 2007, 2011),
        ("England and Wales: 2012 onwards", 2012, 3000),
    )
}

# Non-standard labels that are rewritten to one of the standard labels above.
remap = {"England and Wales: 2007 onwards": "England and Wales: 2007-2011"}

# Every year from 0 to 3000 inclusive, mapped to the first band whose bounds contain it.
expanded_map = {
    year: next(
        band for band, limits in bounds_map.items() if limits["l"] <= year <= limits["u"]
    )
    for year in range(0, 3001)
}
|
||||
|
||||
|
||||
def is_int(x):
    """
    Report whether ``int(x)`` succeeds.

    :param x: any value
    :return: True when ``int(x)`` does not raise, False otherwise
    """
    try:
        int(x)
    # Only the failures int() itself produces are treated as "not an int":
    # TypeError (e.g. None), ValueError (e.g. "abc", NaN) and OverflowError (infinity).
    # A bare except would also swallow KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return False
    return True
|
||||
|
||||
|
||||
class DataProcessor:
|
||||
"""
|
||||
|
|
@ -45,66 +79,36 @@ class DataProcessor:
|
|||
def insert_data(self, data: pd.DataFrame) -> None:
|
||||
self.data = data
|
||||
|
||||
@staticmethod
def clean_construction_age_band(x):
    """
    Clean a single raw construction age band value.

    :param x: the raw value, e.g. a band label or a year
    :return: x unchanged when it is a member of Definitions.DATA_ANOMALY_MATCHES, None or NaN,
        or already a key of bounds_map; the remap entry when x is a key of remap; otherwise
        the expanded_map label for int(x) when int(x) succeeds
    :raises NotImplementedError: when none of the cases above apply
    :raises KeyError: when int(x) succeeds but is not a key of expanded_map
    """
    # Firstly, we check if it's an error value
    if x in Definitions.DATA_ANOMALY_MATCHES or x is None:
        return x

    # NaN is detected by value: list membership only matches the np.nan object itself,
    # so any other NaN float would otherwise fall through to the NotImplementedError below
    if isinstance(x, (float, np.floating)) and np.isnan(x):
        return x

    # Next, we check if it's a value in our map
    if x in bounds_map:
        return x

    # We check if it's a standard remap value
    remap_value = remap.get(x)
    if remap_value:
        return remap_value

    # We check if it's a number
    if is_int(x):
        return expanded_map[int(x)]

    # Wrapping x in a tuple keeps the formatting valid when x is itself a tuple
    raise NotImplementedError("Not handled the case for value %s" % (x,))
|
||||
|
||||
def standardise_construction_age_band(self):
|
||||
"""
|
||||
This function will tidy up some of the non-standard values that are populated in the construction age
|
||||
band, which is useful for cleaning
|
||||
"""
|
||||
bounds_map = {
|
||||
"England and Wales: before 1900": {"l": 0, "u": 1899},
|
||||
"England and Wales: 1930-1949": {"l": 1930, "u": 1949},
|
||||
"England and Wales: 1900-1929": {"l": 1900, "u": 1929},
|
||||
"England and Wales: 1950-1966": {"l": 1950, "u": 1966},
|
||||
"England and Wales: 1967-1975": {"l": 1967, "u": 1975},
|
||||
"England and Wales: 1976-1982": {"l": 1976, "u": 1982},
|
||||
"England and Wales: 1983-1990": {"l": 1983, "u": 1990},
|
||||
"England and Wales: 1991-1995": {"l": 1991, "u": 1995},
|
||||
"England and Wales: 1996-2002": {"l": 1996, "u": 2002},
|
||||
"England and Wales: 2003-2006": {"l": 2003, "u": 2006},
|
||||
"England and Wales: 2007-2011": {"l": 2007, "u": 2011},
|
||||
"England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
|
||||
}
|
||||
|
||||
remap = {
|
||||
"England and Wales: 2007 onwards": "England and Wales: 2007-2011"
|
||||
}
|
||||
|
||||
expanded_map = {
|
||||
i: [
|
||||
label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
|
||||
][0] for i in range(0, 3001)
|
||||
}
|
||||
|
||||
def is_int(x):
|
||||
try:
|
||||
int(x)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
def clean_construction_age_band(x):
|
||||
# Firstly, we check if it's an error value
|
||||
if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]:
|
||||
return x
|
||||
|
||||
# Next, we check if it's a value in our map
|
||||
if bounds_map.get(x):
|
||||
return x
|
||||
|
||||
# We check if it's a standard remap value
|
||||
remap_value = remap.get(x, None)
|
||||
if remap_value:
|
||||
return remap_value
|
||||
|
||||
# We check if it's a number
|
||||
if is_int(x):
|
||||
x_int = int(x)
|
||||
return expanded_map[x_int]
|
||||
|
||||
raise NotImplementedError("Not handled the case for value %s" % x)
|
||||
|
||||
self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
|
||||
lambda x: clean_construction_age_band(x)
|
||||
lambda x: self.clean_construction_age_band(x)
|
||||
)
|
||||
|
||||
self.data = self.data[
|
||||
|
|
@ -347,7 +351,7 @@ class DataProcessor:
|
|||
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")
|
||||
|
||||
# If there still is na values, use average across all property_change in consituecy
|
||||
# If there are still NA values, use the average across all properties in the constituency
|
||||
cleaning_averages_filled[variable] = cleaning_averages_filled[
|
||||
variable
|
||||
].fillna(cleaning_averages_filled[variable].mean())
|
||||
|
|
@ -4,7 +4,7 @@ from tqdm import tqdm
|
|||
import msgpack
|
||||
|
||||
from pathlib import Path
|
||||
from etl.property_change.settings import (
|
||||
from etl.epc.settings import (
|
||||
MANDATORY_FIXED_FEATURES,
|
||||
LATEST_FIELD,
|
||||
COMPONENT_FEATURES,
|
||||
|
|
@ -14,7 +14,7 @@ from etl.property_change.settings import (
|
|||
EARLIEST_EPC_DATE,
|
||||
CARBON_RESPONSE,
|
||||
)
|
||||
from etl.property_change.DataProcessor import DataProcessor
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
|
||||
from recommendations.rdsap_tables import england_wales_age_band_lookup
|
||||
from recommendations.recommendation_utils import (
|
||||
|
|
@ -4,7 +4,7 @@ import pandas as pd
|
|||
import msgpack
|
||||
|
||||
from etl.epc_clean.EpcClean import EpcClean
|
||||
from etl.property_change.settings import EARLIEST_EPC_DATE
|
||||
from etl.epc.settings import EARLIEST_EPC_DATE
|
||||
from pathlib import Path
|
||||
from utils.s3 import save_data_to_s3
|
||||
|
||||
|
|
@ -27,7 +27,7 @@ ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
|
|||
def app():
|
||||
"""
|
||||
For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
|
||||
and produce a dataset of cleaned fields so that when we get new property_change, we can quickly
|
||||
and produce a dataset of cleaned fields so that when we get new epc, we can quickly
|
||||
sanitise any description data
|
||||
|
||||
Currently, this application is just run on a local machine
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ class BoreholeClient:
|
|||
|
||||
# EXAMPLE
|
||||
# There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of
|
||||
# entries in here if possible before we produce any form of comparison between our property_change, to infer
|
||||
# entries in here if possible before we produce any form of comparison between our properties, to infer
|
||||
# the distance from the property to the nearest borehole
|
||||
|
||||
# Let's take a sample
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
This script produces the dataset used to model the wall area of property_change, which is used to estimate the cost
|
||||
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
|
||||
of insulation measures within homes
|
||||
"""
|
||||
import os
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ resource "aws_db_instance" "default" {
|
|||
publicly_accessible = true
|
||||
}
|
||||
|
||||
# Set up the bucket that recieve the csv uploads of property_change to be retrofit
|
||||
# Set up the bucket that receives the CSV uploads of properties to be retrofitted
|
||||
module "s3_presignable_bucket" {
|
||||
source = "./modules/s3_presignable_bucket"
|
||||
bucketname = "retrofit-plan-inputs-${var.stage}"
|
||||
|
|
|
|||
|
|
@ -7,6 +7,6 @@ Flat 3 Frederick Building,N1 4BD,,,,,
|
|||
Flat 4 Frederick Building,N1 4BD,,,,,
|
||||
"Flat 28, 22 Adelina Grove",E1 3BX,,,,,
|
||||
"Flat 39, 239 Long Lane",SE1 4PT,,,,,
|
||||
"1, Westview, Someday",LE14 2QH,This property has an unfilled cavity,,,,
|
||||
"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,,
|
||||
"59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,,
|
||||
88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,,
|
||||
|
|
|
@ -91,8 +91,8 @@ class WallRecommendations(Definitions):
|
|||
if self.property.walls["thermal_transmittance_unit"] != self.U_VALUE_UNIT:
|
||||
raise NotImplementedError("Haven't handled the case of other u value units yet")
|
||||
|
||||
# TODO: It's worth thinking about this logic because depending on when property_change were built,
|
||||
# they're likely to be of a certain standard. E.g. property_change built within a certain time
|
||||
# TODO: It's worth thinking about this logic because depending on when properties were built,
|
||||
# they're likely to be of a certain standard. E.g. properties built within a certain time
|
||||
# period are likely to have cavity walls
|
||||
|
||||
# We can't detect it's a cavity wall, but it was built after 1990 so likely built with insulation already
|
||||
|
|
|
|||
|
|
@ -230,7 +230,7 @@ class TestWallRecommendations:
|
|||
The important data for this recommendation is:
|
||||
- u value of 0.16
|
||||
- property built in 2014
|
||||
Since property_change built after 1990 are typically built with insulation and this property
|
||||
Since properties built after 1990 are typically built with insulation and this property
|
||||
already has really good insulation, we do NOT recommend any measures for this property
|
||||
"""
|
||||
input_properties[0].year_built = 2014
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue