mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
commit
b2142a7f8e
154 changed files with 1977 additions and 13742 deletions
10
.coveragerc
10
.coveragerc
|
|
@ -2,12 +2,8 @@
|
|||
omit =
|
||||
*__init__*
|
||||
*/tests/*
|
||||
model_data/temp_inputs.py
|
||||
model_data/config.py
|
||||
model_data/__init__.py
|
||||
model_data/app.py
|
||||
model_data/plotting/*
|
||||
recommendations/rdsap_tables.py
|
||||
model_data/simulation_system/*
|
||||
model_data/cleaner_app.py
|
||||
*/config.py
|
||||
*/app.py
|
||||
*/settings.py
|
||||
backend/app/*
|
||||
81
.github/workflows/deploy_sap_model_lambda.yml
vendored
81
.github/workflows/deploy_sap_model_lambda.yml
vendored
|
|
@ -1,81 +0,0 @@
|
|||
name: Sap Model Deploy
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ dev, prod ]
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.10.12
|
||||
|
||||
- name: Install Serverless and plugins
|
||||
run: |
|
||||
npm install -g serverless
|
||||
npm install -g serverless-domain-manager
|
||||
|
||||
- name: AWS credentials for dev
|
||||
if: github.ref == 'refs/heads/dev'
|
||||
uses: aws-actions/configure-aws-credentials@v1
|
||||
with:
|
||||
aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
|
||||
aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
|
||||
aws-region: eu-west-2
|
||||
|
||||
- name: AWS credentials for prod
|
||||
if: github.ref == 'refs/heads/prod'
|
||||
uses: aws-actions/configure-aws-credentials@v1
|
||||
with:
|
||||
aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
|
||||
aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
|
||||
aws-region: eu-west-2
|
||||
|
||||
- name: Set domain name
|
||||
id: set_domain
|
||||
run: echo "::set-output name=domain::${{ secrets[format('{0}_DOMAIN_NAME', github.ref_name)] }}"
|
||||
|
||||
- name: Set ECR credentials
|
||||
id: set_ecr_credentials
|
||||
run: |
|
||||
echo "::set-output name=ecr_uri::${{ secrets[format('{0}_SAP_MODEL_ECR_URI', github.ref_name)] }}"
|
||||
|
||||
- name: Setup Docker
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
- name: Login to ECR
|
||||
run: |
|
||||
aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
|
||||
|
||||
# Building and pushing Docker image with caching
|
||||
- name: Build and push Docker image
|
||||
uses: docker/build-push-action@v3
|
||||
with:
|
||||
context: ./model_data/simulation_system
|
||||
file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda
|
||||
push: true
|
||||
tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
platform: linux/amd64
|
||||
provenance: false
|
||||
|
||||
- name: Deploy to AWS Lambda via Serverless
|
||||
env:
|
||||
RUNTIME_ENVIRONMENT: ${{ github.ref_name }}
|
||||
MODEL_DIRECTORY_BUCKET: 'retrofit-model-directory-${{ github.ref_name }}'
|
||||
PREDICTIONS_BUCKET: 'retrofit-sap-predictions-${{ github.ref_name }}'
|
||||
DATA_BUCKET: 'retrofit-data-${{ github.ref_name }}'
|
||||
DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }}
|
||||
ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
|
||||
GITHUB_SHA: ${{ github.sha }}
|
||||
run: |
|
||||
# Deploy to AWS Lambda via Serverless
|
||||
sls deploy --config sapmodel.serverless.yml --stage ${{ github.ref_name }} --verbose
|
||||
6
.gitignore
vendored
6
.gitignore
vendored
|
|
@ -239,7 +239,8 @@ fabric.properties
|
|||
.idea/caches/build_file_checksums.ser
|
||||
|
||||
# Locally stored data
|
||||
/model_data/local_data/*
|
||||
local_data/*
|
||||
/local_data/*
|
||||
|
||||
*.DS_Store
|
||||
infrastructure/terraform/.terraform*
|
||||
|
|
@ -261,3 +262,6 @@ model_data/simulation_system/predictions/
|
|||
|
||||
.idea/Model.iml
|
||||
.idea/misc.iml
|
||||
|
||||
adhoc
|
||||
adhoc/*
|
||||
9
.idea/Model.iml
generated
9
.idea/Model.iml
generated
|
|
@ -7,7 +7,14 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyNamespacePackagesService">
|
||||
<option name="namespacePackageFolders">
|
||||
<list>
|
||||
<option value="$MODULE_DIR$/local_data" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
</module>
|
||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -43,7 +43,9 @@ class Definitions:
|
|||
# contained within the first of these multiple entries is being provided. As there are no restrictions on the
|
||||
# value in this first field it means that sometimes the first field in a multiple entry description field may
|
||||
# contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
|
||||
"NULL"
|
||||
"NULL",
|
||||
# We sometimes see fields populated with just an empty string.
|
||||
""
|
||||
}
|
||||
|
||||
DATA_ANOMALY_SUBSTRINGS = {
|
||||
|
|
@ -1,9 +1,22 @@
|
|||
from datetime import datetime
|
||||
import re
|
||||
import os
|
||||
import pandas as pd
|
||||
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_dataframe_from_s3_parquet
|
||||
from epc_api.client import EpcClient
|
||||
from model_data.config import EPC_AUTH_TOKEN
|
||||
from model_data.BaseUtility import Definitions
|
||||
from BaseUtility import Definitions
|
||||
from recommendations.rdsap_tables import england_wales_age_band_lookup
|
||||
from recommendations.recommendation_utils import estimate_floors, estimate_perimeter, get_wall_type, estimate_wall_area
|
||||
|
||||
ENVIRONMENT = os.environ.get('ENVIRONMENT', 'dev')
|
||||
EPC_AUTH_TOKEN = os.environ.get('EPC_AUTH_TOKEN')
|
||||
DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT == 'dev' else None)
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class Property(Definitions):
|
||||
|
|
@ -30,17 +43,27 @@ class Property(Definitions):
|
|||
lighting = None
|
||||
|
||||
coordinates = None
|
||||
age_band = None
|
||||
|
||||
def __init__(self, id, postcode, address1, epc_client=None, data=None):
|
||||
self.id = id
|
||||
self.postcode = postcode
|
||||
self.address1 = address1
|
||||
self.data = data
|
||||
self.old_data = None
|
||||
self.property_dimensions = None
|
||||
|
||||
self.uprn = None
|
||||
self.full_sap_epc = None
|
||||
self.in_conservation_area = None
|
||||
self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
|
||||
self.restricted_measures = False
|
||||
self.year_built = None
|
||||
self.number_of_rooms = None
|
||||
self.age_band = None
|
||||
self.construction_age_band = None
|
||||
self.number_of_floors = None
|
||||
self.perimeter = None
|
||||
self.wall_type = None
|
||||
self.floor_type = None
|
||||
|
||||
self.energy = None
|
||||
self.ventilation = None
|
||||
|
|
@ -83,9 +106,14 @@ class Property(Definitions):
|
|||
]
|
||||
if len(newest_response) > 1:
|
||||
raise Exception("More than one result found for this address - investigate me")
|
||||
|
||||
# We'll keep old EPCs in case it contains information, not present on the newest one
|
||||
self.old_data = [epc for epc in response["rows"] if epc["lmk-key"] != newest_response[0]["lmk-key"]]
|
||||
|
||||
response["rows"] = newest_response
|
||||
|
||||
self.data = response["rows"][0]
|
||||
self.uprn = int(self.data["uprn"])
|
||||
|
||||
def set_coordinates(self, coordinates):
|
||||
"""
|
||||
|
|
@ -127,7 +155,7 @@ class Property(Definitions):
|
|||
"""
|
||||
|
||||
ventilation = self.data["mechanical-ventilation"]
|
||||
# perform some simple cleaning - when checking 300k properties, the only unique values were
|
||||
# perform some simple cleaning - when checking 300k epc, the only unique values were
|
||||
# {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'}
|
||||
if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]:
|
||||
ventilation = None
|
||||
|
|
@ -145,7 +173,7 @@ class Property(Definitions):
|
|||
- solar_pv
|
||||
This is based on the "photo-supply" field in the EPC data.
|
||||
|
||||
When checking 100k properties, either the value was "" or a stringified number
|
||||
When checking 100k epc, either the value was "" or a stringified number
|
||||
"""
|
||||
|
||||
solar_pv = self.data["photo-supply"]
|
||||
|
|
@ -244,11 +272,10 @@ class Property(Definitions):
|
|||
self.set_count_variables()
|
||||
self.set_heat_loss_corridor()
|
||||
self.set_mains_gas()
|
||||
self.set_floor_height()
|
||||
self.set_wall_area()
|
||||
self.set_floor_area()
|
||||
self.set_age_band()
|
||||
|
||||
self.set_basic_property_dimensions()
|
||||
|
||||
for description, attribute in cleaned.items():
|
||||
|
||||
if self.data[description] in self.DATA_ANOMALY_MATCHES:
|
||||
|
|
@ -262,10 +289,19 @@ class Property(Definitions):
|
|||
attributes = [
|
||||
x for x in cleaned[description] if x["original_description"] == self.data[description]
|
||||
]
|
||||
if len(attributes) != 1:
|
||||
if len(attributes) > 1:
|
||||
raise ValueError("Either No attributes or multiple found for %s" % description)
|
||||
|
||||
if len(attributes) == 0:
|
||||
# We attempt to perform the clean on the fly
|
||||
cleaner_cls = all_cleaner_map[description]
|
||||
attributes = [cleaner_cls(self.data[description]).process()]
|
||||
|
||||
setattr(self, self.ATTRIBUTE_MAP[description], attributes[0])
|
||||
|
||||
self.set_wall_type()
|
||||
self.set_floor_type()
|
||||
|
||||
def set_age_band(self):
|
||||
"""
|
||||
Sets a cleaned version of the age band of the property given the EPC data
|
||||
|
|
@ -275,14 +311,20 @@ class Property(Definitions):
|
|||
if not self.data:
|
||||
raise ValueError("Property does not contain data")
|
||||
|
||||
self.age_band = england_wales_age_band_lookup[self.data["construction-age-band"]]
|
||||
self.construction_age_band = DataProcessor.clean_construction_age_band(self.data["construction-age-band"])
|
||||
self.age_band = england_wales_age_band_lookup.get(self.construction_age_band)
|
||||
|
||||
def set_is_in_conservation_area(self, in_conservation_area):
|
||||
def set_spatial(self, spatial: pd.DataFrame):
    """
    Sets the spatial planning flags of the property and derives whether measures are restricted

    The first row of the "conservation_status", "is_listed_building" and "is_heritage_building" columns is
    stored on the property. If any of the three is a boolean True, restricted_measures is switched on.
    :param spatial: Dataframe, containing the spatial data for the property
    :raises KeyError: if one of the three columns is missing
    :raises IndexError: if spatial contains no rows
    """
    self.in_conservation_area = spatial["conservation_status"].values[0]
    self.is_listed = spatial["is_listed_building"].values[0]
    self.is_heritage = spatial["is_heritage_building"].values[0]

    # `a is True | b is True | c is True` does not mean "any flag is True": `|` binds tighter than `is`, so it
    # was evaluated as the chained comparison `a is (True | b) is (True | c) is True`. On top of that, values
    # pulled from a dataframe are numpy booleans, which never satisfy `is True`. We therefore test each flag on
    # its own, accepting both python and numpy booleans.
    # NOTE(review): a non-boolean conservation_status (e.g. a status string) never restricts measures here,
    # which matches the strict `is True` intent of the original check - confirm the column is boolean
    flags = (self.in_conservation_area, self.is_listed, self.is_heritage)
    if any(pd.api.types.is_bool(flag) and bool(flag) for flag in flags):
        self.restricted_measures = True
|
||||
|
||||
def set_year_built(self):
|
||||
"""
|
||||
|
|
@ -349,17 +391,6 @@ class Property(Definitions):
|
|||
else:
|
||||
self.mains_gas = map[self.data["mains-gas-flag"]]
|
||||
|
||||
def set_floor_height(self):
|
||||
"""
|
||||
Sets the floor height of the property
|
||||
:return:
|
||||
"""
|
||||
|
||||
if self.data["floor-height"] == "" or self.data["floor-height"] in self.DATA_ANOMALY_MATCHES:
|
||||
self.floor_height = None
|
||||
else:
|
||||
self.floor_height = float(self.data["floor-height"])
|
||||
|
||||
def _clean_upload_data(self, to_update):
|
||||
for k, v in to_update.items():
|
||||
if v in self.DATA_ANOMALY_MATCHES:
|
||||
|
|
@ -443,21 +474,210 @@ class Property(Definitions):
|
|||
|
||||
return property_details_epc
|
||||
|
||||
def set_wall_area(self):
|
||||
"""
|
||||
This method is placeholder
|
||||
It implements our floor area model to produce an estimate of the property's insulatable wall area
|
||||
"""
|
||||
|
||||
import random
|
||||
self.insulation_wall_area = random.uniform(60, 100)
|
||||
|
||||
def set_floor_area(self):
|
||||
"""
|
||||
Sets the floor area based on the EPC data
|
||||
def get_spatial_data(self, uprn_filenames):
    """
    Given a property's UPRN, this method will pull the associated spatial data from s3

    The spatial data is split across files, each covering a range of UPRNs. We first look up which file
    covers this property's UPRN, read that file and then pass the matching rows to set_spatial.
    :param uprn_filenames: dataframe with "lower", "upper" and "filenames" columns, describing the UPRN range
        held in each spatial file
    :return: None. A warning is logged and nothing is set, if no file or no row is found for the UPRN
    :raises ValueError: if the UPRN has not been set on the property
    """

    if self.uprn is None:
        raise ValueError("UPRN is not set, run search_address_epc")

    # We get the file name for the uprn
    covers_uprn = (uprn_filenames['lower'] <= self.uprn) & (uprn_filenames['upper'] >= self.uprn)
    filtered_df = uprn_filenames[covers_uprn]
    if filtered_df.empty:
        logger.warning("Could not find file containing UPRNS")
        return None

    filename = filtered_df.iloc[0]['filenames']

    spatial_data = read_dataframe_from_s3_parquet(
        bucket_name=DATA_BUCKET, file_key=f"spatial/{filename}"
    )

    spatial = spatial_data[spatial_data["UPRN"] == self.uprn]
    if spatial.empty:
        # set_spatial reads the first row, so an unmatched UPRN would otherwise surface as an IndexError.
        # We treat it the same way as a missing file
        logger.warning(f"Could not find spatial data for UPRN {self.uprn}")
        return None

    # Pull out spatial features
    self.set_spatial(spatial)
|
||||
|
||||
def _filter_property_dimensions(self, property_dimensions):
    """
    Will filter the property dimensions dataframe to only include the relevant rows for the property and
    average the dimensions across those rows

    Rows are always filtered on property type. They are filtered on construction age band if the property has
    a usable one, and on built form if the property has a usable one which is present in the remaining rows.
    :param property_dimensions: dataframe with PROPERTY_TYPE, CONSTRUCTION_AGE_BAND, BUILT_FORM,
        NUMBER_HABITABLE_ROOMS, TOTAL_FLOOR_AREA and FLOOR_HEIGHT columns
    :return: series containing the mean NUMBER_HABITABLE_ROOMS, TOTAL_FLOOR_AREA and FLOOR_HEIGHT of the
        filtered rows (the means are NaN if no rows are left)
    """
    anomalies = self.DATA_ANOMALY_MATCHES

    result = property_dimensions[property_dimensions["PROPERTY_TYPE"] == self.data["property-type"]]

    age_band = self.construction_age_band
    if age_band is not None and age_band not in anomalies:
        result = result[result["CONSTRUCTION_AGE_BAND"] == age_band]

    built_form = self.data["built-form"]
    if built_form not in anomalies:
        # `built_form in result["BUILT_FORM"]` would test the index labels of the series rather than its
        # values, so we compare against the values explicitly. The filter is only applied when it matches
        # something, otherwise we keep the broader set of rows
        matches_built_form = result["BUILT_FORM"] == built_form
        if matches_built_form.any():
            result = result[matches_built_form]

    return result[["NUMBER_HABITABLE_ROOMS", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]].mean()
|
||||
|
||||
def set_basic_property_dimensions(self):
    """
    This method sets the basic dimensions of the property: floor area, number of rooms, number of floors,
    floor height, perimeter and insulatable wall area

    Floor area is read from the EPC data. Number of rooms and floor height are read from the EPC data when
    present; when either one is missing, it falls back to the averaged value produced by
    _filter_property_dimensions from the property dimensions file of the property's local authority.

    Number of floors is estimated for houses and fixed to 1 for flats. Perimeter and wall area are then
    estimated from the per-floor area, per-floor room count and floor height.
    :return:
    :raises NotImplementedError: if the property type is neither "House" nor "Flat"
    """

    self.floor_area = float(self.data["total-floor-area"])

    anomalies = self.DATA_ANOMALY_MATCHES
    raw_rooms = self.data["number-habitable-rooms"]
    raw_floor_height = self.data["floor-height"]

    # Anomaly markers are treated as missing for both fields - previously a marker in the room count passed
    # the emptiness check and failed when converted to a float
    rooms_missing = not raw_rooms or raw_rooms in anomalies
    floor_height_missing = raw_floor_height == "" or raw_floor_height in anomalies

    # The backup values are only downloaded when they're needed and haven't been loaded already
    if (rooms_missing or floor_height_missing) and self.property_dimensions is None:
        property_dimensions = read_dataframe_from_s3_parquet(
            bucket_name=DATA_BUCKET, file_key=f"property_dimensions/{self.data['local-authority']}.parquet"
        )
        self.property_dimensions = self._filter_property_dimensions(property_dimensions)

    if rooms_missing:
        self.number_of_rooms = float(self.property_dimensions["NUMBER_HABITABLE_ROOMS"].round())
    else:
        self.number_of_rooms = float(raw_rooms)

    property_type = self.data["property-type"]
    if property_type == "House":
        self.number_of_floors = estimate_floors(self.floor_area, self.number_of_rooms)
    elif property_type == "Flat":
        self.number_of_floors = 1
    else:
        raise NotImplementedError("Implement me")

    if floor_height_missing:
        self.floor_height = float(self.property_dimensions["FLOOR_HEIGHT"].round(2))
    else:
        self.floor_height = float(raw_floor_height)

    # The perimeter is estimated for a single floor, so area and rooms are split evenly across the floors
    self.perimeter = estimate_perimeter(
        self.floor_area / self.number_of_floors, self.number_of_rooms / self.number_of_floors
    )

    self.insulation_wall_area = estimate_wall_area(
        num_floors=self.number_of_floors, floor_height=self.floor_height, perimeter=self.perimeter
    )
|
||||
|
||||
def set_wall_type(self):
    """
    Derives the wall type of the property and stores it on wall_type

    The cleaned wall attributes are handed to get_wall_type as keyword arguments
    :return:
    """
    wall_attributes = self.walls
    self.wall_type = get_wall_type(**wall_attributes)
|
||||
|
||||
def set_floor_type(self):
    """
    Stores the floor type of the property on floor_type, which is used for calculating u-values

    The floor is "suspended" when the cleaned floor attributes flag it as suspended, otherwise it is "solid"
    :return:
    """
    if self.floor["is_suspended"]:
        self.floor_type = "suspended"
    else:
        self.floor_type = "solid"
|
||||
|
||||
@staticmethod
def _extract_component(component_data, component_rename_cols, component_drop_cols, rename_prefix=None):
    """
    Prepares the cleaned attributes of a component for the model, by prefixing some keys and dropping others

    Every key in component_rename_cols is re-published as "{rename_prefix}_{key}" and its un-prefixed version
    is removed, along with every key in component_drop_cols.

    The input dictionary is not modified. Previously the prefixed keys were written back into it, which meant
    the cleaned attributes stored on the property changed as a side effect of extracting model data.
    :param component_data: dictionary of cleaned attributes for a single component
    :param component_rename_cols: keys to re-publish under the prefixed name
    :param component_drop_cols: keys to leave out of the result
    :param rename_prefix: prefix used for the renamed keys
    :return: new dictionary containing the remaining keys, followed by the prefixed keys
    :raises KeyError: if a key in component_rename_cols is missing from component_data
    """
    prefixed = {f"{rename_prefix}_{k}": component_data[k] for k in component_rename_cols}
    excluded = set(component_drop_cols) | set(component_rename_cols)

    # The prefixed keys are merged in before filtering, so a prefixed key which is also listed in the drop
    # columns is still dropped
    combined = {**component_data, **prefixed}

    return {k: v for k, v in combined.items() if k not in excluded}
|
||||
|
||||
def get_model_data(self):
    """
    Builds the flat dictionary of cleaned features, which is scored by our machine learning models

    The result combines the cleaned component attributes (walls, roof, floor, fuel, heating, heating controls,
    hot water and windows), a selection of raw EPC fields, values derived through DataProcessor and the
    dimensions which have already been set on the property.

    For future iterations of this, we probably want to implement a singular method in DataProcessor, which can
    be used in the etl code and in here

    :return: dictionary of model data to be scored in the model
    :raises NotImplementedError: if built form is an anomaly and we have no fallback for the property type
    """

    description_cols = ["original_description", "clean_description"]
    insulation_only_cols = ["thermal_transmittance_unit", "is_assumed", "is_valid"]
    prefixed_insulation_cols = ["thermal_transmittance", "insulation_thickness"]
    insulation_excluded = insulation_only_cols + description_cols

    # The insulated components share the same layout, only the prefix of the renamed keys differs
    walls = self._extract_component(self.walls, prefixed_insulation_cols, insulation_excluded, "walls")
    roof = self._extract_component(self.roof, prefixed_insulation_cols, insulation_excluded, "roof")
    floor = self._extract_component(self.floor, prefixed_insulation_cols, insulation_excluded, "floor")

    windows = self._extract_component(self.windows, [], description_cols + ["no_data"])
    fuel = self._extract_component(
        self.main_fuel, ["tariff_type"], description_cols + ["tariff_type"], "main-fuel"
    )
    main_heating = self._extract_component(self.main_heating, [], description_cols + ["has_assumed"])
    main_heating_controls = self._extract_component(self.main_heating_controls, [], description_cols)
    hotwater = self._extract_component(
        self.hotwater, ["tariff_type"], description_cols + ['assumed'], "hotwater"
    )

    # We'll need to clean second heating
    second_heating = self.data["secondheat-description"]

    epc_raw_columns = [
        'TRANSACTION_TYPE',
        'ENERGY_TARIFF',
        'PROPERTY_TYPE',
        'UPRN',
        'NUMBER_OPEN_FIREPLACES',
        'FIXED_LIGHTING_OUTLETS_COUNT',
        'MULTI_GLAZE_PROPORTION',
        'MECHANICAL_VENTILATION',
        'PHOTO_SUPPLY',
        'LOW_ENERGY_LIGHTING',
        'SOLAR_WATER_HEATING_FLAG',
        'GLAZED_TYPE',
        'CONSTITUENCY',
        'NUMBER_HEATED_ROOMS',
        'EXTENSION_COUNT',
    ]
    # The raw EPC data is keyed in lower case with hyphens, e.g. TRANSACTION_TYPE -> transaction-type
    epc_raw_data = {}
    for column in epc_raw_columns:
        epc_raw_data[column] = self.data[column.lower().replace("_", "-")]

    fallback_built_form = {
        "Flat": "Mid-Terrace",
        "House": "Semi-Detached",
        "Bungalow": "Detached",
        "Maisonette": "Mid-Terrace"
    }

    built_form = self.data["built-form"]
    if built_form in self.DATA_ANOMALY_MATCHES:
        # TODO: If built form isn't captured, we use the most common value for that property type - we shall
        #   improve this methodology
        built_form = fallback_built_form.get(self.data["property-type"])
        if not built_form:
            raise NotImplementedError("Not handled this property type when cleaning built form")

    # Later sources win when a key is repeated, so the order of the updates below matters
    property_data = {}
    for component in (walls, roof, floor, fuel, main_heating, main_heating_controls, hotwater, windows):
        property_data.update(component)

    property_data.update({
        "SECONDHEAT_DESCRIPTION": second_heating,
        "DAYS_TO": DataProcessor.calculate_days_to(self.data["lodgement-date"]),
        "SAP": float(self.data["current-energy-efficiency"]),
        "CARBON": float(self.data["co2-emissions-current"]),
        "HEAT_DEMAND": float(self.data["energy-consumption-current"]),
        "estimated_perimeter": self.perimeter,
        "CONSTRUCTION_AGE_BAND": self.construction_age_band,
        "FLOOR_HEIGHT": self.floor_height,
        "NUMBER_HABITABLE_ROOMS": self.number_of_rooms,
        "TOTAL_FLOOR_AREA": self.floor_area,
    })
    property_data.update(epc_raw_data)
    property_data["BUILT_FORM"] = built_form

    return property_data
|
||||
|
|
|
|||
|
|
@ -1,10 +1,17 @@
|
|||
from backend.app.db.models.materials import Material
|
||||
from functools import lru_cache
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def get_materials(session):
|
||||
"""
|
||||
This function will retrieve all materials from the database.
|
||||
:return: A list of Material objects if successful, an empty list otherwise.
|
||||
|
||||
|
||||
TODO: It might not be the best choice to store the materials data in a database table since this
|
||||
table probably won't be very large and won't be updated that often. It might be better to
|
||||
store this data in s3 load it into memory when the app starts up. We will test this
|
||||
"""
|
||||
|
||||
materials = session.query(Material).filter(Material.is_active).all()
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ class MaterialType(enum.Enum):
|
|||
solid_floor_insulation = "solid_floor_insulation"
|
||||
external_wall_insulation = "external_wall_insulation"
|
||||
internal_wall_insulation = "internal_wall_insulation"
|
||||
cavity_wall_insulation = "cavity_wall_insulation"
|
||||
|
||||
|
||||
class DepthUnit(enum.Enum):
|
||||
|
|
|
|||
|
|
@ -1,50 +1,41 @@
|
|||
from collections import defaultdict
|
||||
from fastapi import APIRouter, Depends
|
||||
from backend.app.db.models.portfolio import rating_lookup
|
||||
from backend.app.dependencies import validate_token
|
||||
from backend.app.plan.schemas import PlanTriggerRequest
|
||||
from backend.app.utils import read_csv_from_s3
|
||||
from backend.app.config import get_settings
|
||||
from backend.Property import Property
|
||||
from epc_api.client import EpcClient
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_from_s3
|
||||
from recommendations.FloorRecommendations import FloorRecommendations
|
||||
from recommendations.WallRecommendations import WallRecommendations
|
||||
from recommendations.config import UPGRADES_MAP
|
||||
from utils.uvalue_estimates import classify_decile_newvalues
|
||||
from backend.app.db.utils import row2dict
|
||||
from starlette.responses import Response
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from sqlalchemy.exc import IntegrityError, OperationalError
|
||||
from datetime import datetime
|
||||
|
||||
import pandas as pd
|
||||
import msgpack
|
||||
from epc_api.client import EpcClient
|
||||
from fastapi import APIRouter, Depends
|
||||
from sqlalchemy.exc import IntegrityError, OperationalError
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from starlette.responses import Response
|
||||
|
||||
# model apis
|
||||
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
|
||||
|
||||
# database interaction functions
|
||||
from backend.app.db.functions.property_functions import (
|
||||
create_property, create_property_targets, update_property_data, create_property_details_epc
|
||||
)
|
||||
from backend.app.config import get_settings
|
||||
from backend.app.db.connection import db_engine
|
||||
from backend.app.db.functions.materials_functions import get_materials
|
||||
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
|
||||
from backend.app.db.functions.property_functions import (
|
||||
create_property, create_property_details_epc, create_property_targets, update_property_data
|
||||
)
|
||||
from backend.app.db.functions.recommendations_functions import (
|
||||
create_plan, create_plan_recommendations, upload_recommendations
|
||||
)
|
||||
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
|
||||
from backend.app.db.connection import db_engine
|
||||
from backend.app.db.models.portfolio import rating_lookup
|
||||
from backend.app.dependencies import validate_token
|
||||
from backend.app.plan.schemas import PlanTriggerRequest
|
||||
from backend.app.plan.utils import (
|
||||
create_recommendation_scoring_data, filter_materials, get_cleaned, insert_temp_recommendation_id
|
||||
)
|
||||
from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_parquet_from_s3
|
||||
|
||||
from model_data.optimiser.GainOptimiser import GainOptimiser
|
||||
from model_data.optimiser.CostOptimiser import CostOptimiser
|
||||
from backend.app.utils import epc_to_sap_lower_bound, read_parquet_from_s3
|
||||
from model_data.optimiser.optimiser_functions import prepare_input_measures
|
||||
from model_data.simulation_system.core.DataProcessor import DataProcessor
|
||||
from model_data.simulation_system.core.Settings import COLUMNS_TO_MERGE_ON
|
||||
|
||||
# TODO: This is placeholder until data is stored in DB
|
||||
from backend.app.plan.uvalue_estimates_walls import uvalue_estimates_walls
|
||||
from backend.app.plan.uvalue_estimates_floors import uvalue_estimates_floors
|
||||
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
|
||||
from backend.Property import Property
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from etl.epc.settings import COLUMNS_TO_MERGE_ON
|
||||
from recommendations.FloorRecommendations import FloorRecommendations
|
||||
from recommendations.optimiser.CostOptimiser import CostOptimiser
|
||||
from recommendations.optimiser.GainOptimiser import GainOptimiser
|
||||
from recommendations.optimiser.optimiser_functions import prepare_input_measures
|
||||
from recommendations.WallRecommendations import WallRecommendations
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_dataframe_from_s3_parquet
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
|
@ -55,147 +46,25 @@ router = APIRouter(
|
|||
responses={404: {"description": "Not found"}}
|
||||
)
|
||||
|
||||
# TODO: Load this data from db
|
||||
open_uprn_data = [
|
||||
{'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
|
||||
'LONGITUDE': -0.0540506},
|
||||
{'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
|
||||
'LONGITUDE': -0.0498772},
|
||||
{'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
|
||||
'LONGITUDE': -0.226392},
|
||||
{'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
|
||||
'LONGITUDE': -0.0792445},
|
||||
{'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
|
||||
'LONGITUDE': -0.0792445},
|
||||
{'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
|
||||
'LONGITUDE': -0.0468833},
|
||||
{'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
|
||||
'LONGITUDE': -0.1362513},
|
||||
{'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
|
||||
'LONGITUDE': -0.0823165}
|
||||
]
|
||||
|
||||
in_conservation_area_data = [
|
||||
{'uprn': 6032920, 'is_in_conservation_area': 'not_in_conservation_area'},
|
||||
{'uprn': 6038625, 'is_in_conservation_area': 'not_in_conservation_area'},
|
||||
{'uprn': 34153991, 'is_in_conservation_area': 'unknown'},
|
||||
{'uprn': 10008299676, 'is_in_conservation_area': 'in_conservation_area'},
|
||||
{'uprn': 10008299677, 'is_in_conservation_area': 'in_conservation_area'},
|
||||
{'uprn': 100021039066, 'is_in_conservation_area': 'not_in_conservation_area'},
|
||||
{'uprn': 100021226060, 'is_in_conservation_area': 'in_conservation_area'},
|
||||
{'uprn': 200003489276, 'is_in_conservation_area': 'in_conservation_area'}
|
||||
]
|
||||
|
||||
# TODO: db
|
||||
floors_decile_data = {
|
||||
'decile_labels': ['Decile 1', 'Decile 2', 'Decile 3', 'Decile 4', 'Decile 5', 'Decile 6', 'Decile 7', 'Decile 8',
|
||||
'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 50., 56., 69., 77.6, 87., 98., 112.,
|
||||
127., 150., 2279.]}
|
||||
|
||||
walls_decile_data = {
|
||||
'decile_labels': ['Decile 1', 'Decile 2', 'Decile 3', 'Decile 4', 'Decile 5', 'Decile 6', 'Decile 7', 'Decile 8',
|
||||
'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 49., 51., 55., 64., 71., 76., 83., 96.,
|
||||
120., 2279.]}
|
||||
|
||||
|
||||
def filter_materials(materials):
    """
    Groups the materials by their type

    Each material is converted to a dictionary with row2dict, and collected under the value of its "type" key
    :param materials: iterable of material rows
    :return: dictionary, mapping each material type to the list of material dictionaries of that type
    """
    materials_by_type = {}

    for row in materials:
        record = row2dict(row)
        materials_by_type.setdefault(record["type"], []).append(record)

    return materials_by_type
|
||||
|
||||
|
||||
def insert_temp_recommendation_id(property_recommendations):
    """
    Numbers every recommendation with a temporary "recommendation_id", which is needed for
    filtering recommendations after the optimiser has been run

    The ids start at 0 and keep counting across the groups, so they're unique for the whole input. The
    recommendations are updated in place.
    :param property_recommendations: nested list of recommendations, grouped by data_types
    :return: the same nested list, where each recommendation has a "recommendation_id" integer inserted
    """
    every_recommendation = (rec for recs in property_recommendations for rec in recs)

    for idx, rec in enumerate(every_recommendation):
        rec["recommendation_id"] = idx

    return property_recommendations
|
||||
|
||||
|
||||
def get_cleaned():
    """
    Retrieves the cleaned dataset from s3, which holds the cleaned descriptions for the epc dataset

    The file is read from the retrofit data bucket of the current environment. It is stored in MessagePack
    format and is decoded before being returned
    :return: the decoded cleaned data
    """
    bucket = "retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)
    packed = read_from_s3(s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name=bucket)

    return msgpack.unpackb(packed, raw=False)
|
||||
|
||||
|
||||
def create_recommendation_scoring_data(
    property: Property,
    recommendation: dict,
    starting_epc_data: pd.DataFrame,
    ending_epc_data: pd.DataFrame,
    fixed_data: pd.DataFrame,
):
    """
    Builds the single flat record that is sent to the sap model api for one recommendation.

    The record holds the property identifiers followed by the first row of each of
    the three frames. The ending description of the upgraded component is then
    replaced with its entry in UPGRADES_MAP.
    :param property: property being scored; its data, id, walls and floor are read
    :param recommendation: recommendation dict; "recommendation_id" and "type" are read
    :param starting_epc_data: frame whose first row holds the starting features
    :param ending_epc_data: frame whose first row holds the ending features
    :param fixed_data: frame whose first row holds the fixed features
    :return: dict with one entry per scoring feature
    :raises NotImplementedError: if the recommendation type is neither
    "wall_insulation" nor "floor_insulation"
    """
    scoring_dict = {
        "UPRN": property.data["uprn"],
        "id": "+".join(map(str, (property.id, recommendation["recommendation_id"]))),
        "LOCAL_AUTHORITY": property.data["local-authority"],
    }
    # Later frames win on duplicate keys, matching starting -> ending -> fixed precedence
    for frame in (starting_epc_data, ending_epc_data, fixed_data):
        scoring_dict.update(frame.to_dict("records")[0])

    # Pick the description column to overwrite and the component it is derived from
    recommendation_type = recommendation["type"]
    if recommendation_type == "wall_insulation":
        target_key, component = "WALLS_DESCRIPTION_ENDING", property.walls
    elif recommendation_type == "floor_insulation":
        target_key, component = "FLOOR_DESCRIPTION_ENDING", property.floor
    else:
        raise NotImplementedError("Implement me")

    # The upgraded description marks the component as insulated
    scoring_dict[target_key] = UPGRADES_MAP[component["clean_description"]]

    return scoring_dict
|
||||
|
||||
|
||||
@router.post("/trigger")
|
||||
async def trigger_plan(body: PlanTriggerRequest):
|
||||
logger.info("Connecting to db")
|
||||
session = sessionmaker(bind=db_engine)()
|
||||
created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
||||
created_at = datetime.now().isoformat()
|
||||
|
||||
try:
|
||||
session.begin()
|
||||
logger.info("Getting the inputs")
|
||||
# Read in the trigger file from s3
|
||||
bucket_name = get_settings().PLAN_TRIGGER_BUCKET
|
||||
epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
|
||||
plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path)
|
||||
uprn_filenames = read_dataframe_from_s3_parquet(
|
||||
bucket_name=get_settings().DATA_BUCKET, file_key="spatial/filename_meta.parquet"
|
||||
)
|
||||
cleaning_data = read_parquet_from_s3(
|
||||
bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
|
||||
)
|
||||
|
||||
plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
|
||||
input_properties = []
|
||||
for config in plan_input:
|
||||
# We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
|
||||
|
|
@ -228,32 +97,21 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
if not input_properties:
|
||||
return Response(status_code=204)
|
||||
|
||||
logger.info("Getting EPC, coordinates and conservation area data")
|
||||
logger.info("Getting EPC, and spatial data")
|
||||
for p in input_properties:
|
||||
p.search_address_epc()
|
||||
p.set_year_built()
|
||||
|
||||
coordinate_data = [x for x in open_uprn_data if x['UPRN'] == int(p.data['uprn'])][0]
|
||||
p.set_coordinates(coordinate_data)
|
||||
|
||||
in_conservation_area = [x for x in in_conservation_area_data if x['uprn'] == int(p.data['uprn'])][0].get(
|
||||
"is_in_conservation_area"
|
||||
)
|
||||
p.set_is_in_conservation_area(in_conservation_area)
|
||||
p.get_spatial_data(uprn_filenames)
|
||||
|
||||
# The materials data could be cached or local so we don't need to make
|
||||
# consistent requests to the backend for
|
||||
# the same data
|
||||
# TODO: It might not be the best choice to store the materials data in a database table since this
|
||||
# table probably won't be very large and won't be updated that often. It might be better to
|
||||
# store this data in s3 load it into memory when the app starts up. We will test this
|
||||
|
||||
logger.info("Reading in materials and cleaned datasets")
|
||||
materials = get_materials(session)
|
||||
materials_by_type = filter_materials(materials)
|
||||
cleaned = get_cleaned()
|
||||
|
||||
logger.info("Getting components and properties recommendations")
|
||||
logger.info("Getting components and epc recommendations")
|
||||
|
||||
# TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers
|
||||
# in as a dependency and then the optimisers can take the input measures in as part of the setup() method
|
||||
|
|
@ -263,34 +121,13 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
for p in input_properties:
|
||||
property_recommendations = []
|
||||
|
||||
# For each property, classify the floor area decile
|
||||
total_floor_area_group_decile = classify_decile_newvalues(
|
||||
decile_boundaries=floors_decile_data["decile_boundaries"],
|
||||
decile_labels=floors_decile_data["decile_labels"],
|
||||
new_values=[float(p.data["total-floor-area"])],
|
||||
)[0]
|
||||
|
||||
# Property recommendations
|
||||
p.get_components(cleaned)
|
||||
|
||||
# This is placeholder, until the full dataset is loaded into the database and we just make a read to the
|
||||
# database
|
||||
floors_u_value_estimate = [
|
||||
x for x in uvalue_estimates_floors
|
||||
if (x['local-authority'] == p.data["local-authority"]) &
|
||||
(x['property-type'] == p.data["property-type"]) &
|
||||
(x['built-form'] == p.data["built-form"]) &
|
||||
(x['floor-energy-eff'] == p.data["floor-energy-eff"] if p.data[
|
||||
"floor-energy-eff"] != 'N/A' else True) &
|
||||
(x['floor-env-eff'] == p.data["floor-env-eff"] if p.data["floor-env-eff"] != 'N/A' else True)
|
||||
]
|
||||
|
||||
# Floor recommendations
|
||||
floor_recommender = FloorRecommendations(
|
||||
property_instance=p,
|
||||
uvalue_estimates=floors_u_value_estimate,
|
||||
total_floor_area_group_decile=total_floor_area_group_decile,
|
||||
materials=materials_by_type["suspended_floor_insulation"] + materials_by_type["solid_floor_insulation"],
|
||||
materials=materials_by_type["floor"],
|
||||
)
|
||||
floor_recommender.recommend()
|
||||
|
||||
|
|
@ -298,30 +135,10 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
property_recommendations.append(floor_recommender.recommendations)
|
||||
|
||||
# Wall recommendations
|
||||
# We would make this u-value query directly to the database
|
||||
total_floor_area_group_decile = classify_decile_newvalues(
|
||||
decile_boundaries=walls_decile_data["decile_boundaries"],
|
||||
decile_labels=walls_decile_data["decile_labels"],
|
||||
new_values=[float(p.data["total-floor-area"])],
|
||||
)[0]
|
||||
|
||||
# This is placeholder, until the full dataset is loaded into the database and we just make a read to the
|
||||
# database
|
||||
walls_u_value_estimate = [
|
||||
x for x in uvalue_estimates_walls
|
||||
if (x['local-authority'] == p.data["local-authority"]) &
|
||||
(x['property-type'] == p.data["property-type"]) &
|
||||
(x['built-form'] == p.data["built-form"]) &
|
||||
(x['walls-energy-eff'] == p.data["walls-energy-eff"] if p.data[
|
||||
"walls-energy-eff"] != 'N/A' else True) &
|
||||
(x['walls-env-eff'] == p.data["walls-env-eff"] if p.data["walls-env-eff"] != 'N/A' else True)
|
||||
]
|
||||
|
||||
wall_recomender = WallRecommendations(
|
||||
property_instance=p,
|
||||
uvalue_estimates=walls_u_value_estimate,
|
||||
total_floor_area_group_decile=total_floor_area_group_decile,
|
||||
materials=materials_by_type["external_wall_insulation"] + materials_by_type["internal_wall_insulation"]
|
||||
materials=materials_by_type["walls"]
|
||||
)
|
||||
wall_recomender.recommend()
|
||||
|
||||
|
|
@ -337,12 +154,8 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
recommendations[p.id] = property_recommendations
|
||||
|
||||
# Finally, we'll prepare data for predicting the impact on SAP
|
||||
# TODO: We should use the cleaned data from get_components in the data rather than the raw
|
||||
# values. We should create a method in Property which takes the EPC data and inserts the cleaned
|
||||
# data
|
||||
|
||||
data_processor = DataProcessor(None, newdata=True)
|
||||
data_processor.insert_data(pd.DataFrame([p.data.copy()]))
|
||||
data_processor.insert_data(pd.DataFrame([p.get_model_data()]))
|
||||
data_processor.pre_process()
|
||||
|
||||
starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
|
||||
|
|
@ -350,10 +163,10 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
fixed_data = data_processor.get_fixed_features()
|
||||
|
||||
# We update the ending record with the recommended updates and we set lodgement date to today
|
||||
ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
|
||||
ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(created_at)
|
||||
|
||||
for recommendations_by_type in property_recommendations:
|
||||
for rec in recommendations_by_type:
|
||||
for i, rec in enumerate(recommendations_by_type):
|
||||
scoring_dict = create_recommendation_scoring_data(
|
||||
property=p,
|
||||
recommendation=rec,
|
||||
|
|
@ -370,15 +183,6 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
logger.info("Preparing data for scoring in sap change api")
|
||||
recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
|
||||
|
||||
# Clean the data
|
||||
logger.info("Reading in cleaning dataset from s3")
|
||||
cleaning_data = read_parquet_from_s3(
|
||||
bucket_name=get_settings().DATA_BUCKET,
|
||||
file_key="sap_change_model/cleaning_dataset.parquet",
|
||||
).rename(columns={"local-authority": "LOCAL_AUTHORITY"})
|
||||
|
||||
# Merge the cleaning data onto recommendations_scoring_data
|
||||
|
||||
# Perform the same cleaning as in the model
|
||||
recommendations_scoring_data = DataProcessor.apply_averages_cleaning(
|
||||
data_to_clean=recommendations_scoring_data,
|
||||
|
|
@ -386,6 +190,13 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
|
||||
).drop(columns=["LOCAL_AUTHORITY"])
|
||||
|
||||
recommendations_scoring_data = DataProcessor.clean_missings_after_description_process(
|
||||
recommendations_scoring_data, [
|
||||
c for c in recommendations_scoring_data.columns if
|
||||
("thermal_transmittance" in c) or ("insulation_thickness" in c)
|
||||
]
|
||||
)
|
||||
|
||||
sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at)
|
||||
file_location = sap_change_model_api.upload_scoring_data(
|
||||
df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET
|
||||
|
|
@ -396,14 +207,17 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
|
||||
# Retrieve the predictions
|
||||
predictions = pd.DataFrame(
|
||||
read_csv_from_s3(bucket_name=get_settings().PREDICTIONS_BUCKET, filepath=response["storage_filepath"])
|
||||
read_parquet_from_s3(
|
||||
bucket_name=get_settings().PREDICTIONS_BUCKET,
|
||||
file_key=response["storage_filepath"].split(get_settings().PREDICTIONS_BUCKET + "/")[1]
|
||||
)
|
||||
)
|
||||
|
||||
predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(1)
|
||||
predictions["predictions"] = predictions["predictions"].astype(float).round(1)
|
||||
predictions[['property_id', 'recommendation_id']] = predictions['id'].str.split('+', expand=True)
|
||||
|
||||
# Insert the predictions into the recommendations and run the optimiser
|
||||
logger.info("Storing recommendations")
|
||||
logger.info("Optimising recommendations")
|
||||
for property_id in recommendations.keys():
|
||||
|
||||
property = [p for p in input_properties if p.id == property_id][0]
|
||||
|
|
@ -411,9 +225,11 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
|
||||
for recommendations_by_type in recommendations[property_id]:
|
||||
for rec in recommendations_by_type:
|
||||
rec["sap_points"] = property_predictions[property_predictions["recommendation_id"] == str(
|
||||
new_sap = property_predictions[property_predictions["recommendation_id"] == str(
|
||||
rec["recommendation_id"]
|
||||
)]["RDSAP_CHANGE"].values[0]
|
||||
)]["predictions"].values[0]
|
||||
|
||||
rec["sap_points"] = new_sap - float(property.data["current-energy-efficiency"])
|
||||
|
||||
if rec["sap_points"] is None:
|
||||
raise ValueError("Sap points missing")
|
||||
|
|
@ -451,8 +267,6 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
final_recommendations = [
|
||||
rec for recommendations_by_type in final_recommendations for rec in recommendations_by_type
|
||||
]
|
||||
# We update recommendations[property_id]
|
||||
|
||||
recommendations[property_id] = final_recommendations
|
||||
|
||||
# 1) the property data
|
||||
|
|
|
|||
176
backend/app/plan/temp_script_for_flight.py
Normal file
176
backend/app/plan/temp_script_for_flight.py
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
from datetime import datetime
|
||||
|
||||
import pandas as pd
|
||||
from epc_api.client import EpcClient
|
||||
from fastapi import APIRouter, Depends
|
||||
from sqlalchemy.exc import IntegrityError, OperationalError
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from starlette.responses import Response
|
||||
|
||||
from backend.app.config import get_settings
|
||||
from backend.app.db.connection import db_engine
|
||||
from backend.app.db.functions.materials_functions import get_materials
|
||||
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
|
||||
from backend.app.db.functions.property_functions import (
|
||||
create_property, create_property_details_epc, create_property_targets, update_property_data
|
||||
)
|
||||
from backend.app.db.functions.recommendations_functions import (
|
||||
create_plan, create_plan_recommendations, upload_recommendations
|
||||
)
|
||||
from backend.app.db.models.portfolio import rating_lookup
|
||||
from backend.app.dependencies import validate_token
|
||||
from backend.app.plan.schemas import PlanTriggerRequest
|
||||
from backend.app.plan.utils import (
|
||||
create_recommendation_scoring_data, filter_materials, get_cleaned, insert_temp_recommendation_id
|
||||
)
|
||||
from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_parquet_from_s3
|
||||
|
||||
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
|
||||
from backend.Property import Property
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from etl.epc.settings import COLUMNS_TO_MERGE_ON
|
||||
from recommendations.FloorRecommendations import FloorRecommendations
|
||||
from recommendations.optimiser.CostOptimiser import CostOptimiser
|
||||
from recommendations.optimiser.GainOptimiser import GainOptimiser
|
||||
from recommendations.optimiser.optimiser_functions import prepare_input_measures
|
||||
from recommendations.WallRecommendations import WallRecommendations
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_dataframe_from_s3_parquet
|
||||
|
||||
logger = setup_logger()

import pickle


def _load_pickle(path):
    """Return the object unpickled from the local file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code - only point these at trusted files.
# All three snapshots are read from the current working directory.
local_data = _load_pickle('local_data.pickle')
property_dimensions = _load_pickle("property_dimensions.pickle")
sap_change_dataset = _load_pickle("sap_change_dataset.pickle")

# Timestamp of this run; used later as the ending lodgement date
created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Unpack the pre-fetched inputs from the local snapshot
plan_input = local_data["plan_input"]
uprn_filenames = local_data["uprn_filenames"]
local_property_data = local_data["local_property_data"]
cleaned = local_data["cleaned"]
cleaning_data = local_data["cleaning_data"]
materials = local_data["materials"]
materials_by_type = filter_materials(materials)

# Need to find some proper materials
# Until then, two placeholder cavity wall materials are added to the wall materials
example_cavity_wall_materials = [
    {
        'id': 4,
        'type': 'cavity_wall_insulation',
        'description': 'Example Material 1',
        'depths': None,
        'depth_unit': None,
        'cost': 20,
        'cost_unit': 'gbp_sq_meter',
        'r_value_per_mm': 0.0278,
        'r_value_unit': 'square_meter_kelvin_per_watt',
        'thermal_conductivity': 0.036,
        'thermal_conductivity_unit': 'watt_per_meter_kelvin',
        'link': None,
        'created_at': None,
        'is_active': True,
    },
    {
        'id': 10,
        'type': "cavity_wall_insulation",
        'description': 'Example Material 2',
        'depths': None,
        'depth_unit': None,
        'cost': 25,
        'cost_unit': 'gbp_sq_meter',
        'r_value_per_mm': 0.02631579,
        'r_value_unit': 'square_meter_kelvin_per_watt',
        'thermal_conductivity': 0.038,
        'thermal_conductivity_unit': 'watt_per_meter_kelvin',
        'link': None,
        'created_at': None,
        'is_active': True,
    },
]
materials_by_type["walls"] += example_cavity_wall_materials

# No real token is supplied: the EPC data is injected from the snapshot further down
epc_client = EpcClient(auth_token="NO-TOKEN")
|
||||
|
||||
# One Property per row of the plan input. The id comes from the snapshot entry at
# the same position, so plan_input and local_property_data must be aligned.
input_properties = [
    Property(
        postcode=config['postcode'],
        address1=config['address'],
        epc_client=epc_client,
        id=local_property_data[position]["id"],
    )
    for position, config in enumerate(plan_input)
]
|
||||
|
||||
logger.info("Getting EPC, and spatial data")

# Instead of calling the EPC and spatial services, copy the previously captured
# values from the snapshot onto each property (matched by position).
for position, p in enumerate(input_properties):
    snapshot = local_property_data[position]
    for attribute in ("data", "uprn", "id", "full_sap_epc", "old_data"):
        setattr(p, attribute, snapshot[attribute])

    # Heritage flags are switched off for every property in this script
    p.is_listed = False
    p.in_conservation_area = False
    p.is_heritage = False

    p.set_year_built()

    # TODO: TESTING
    # NOTE(review): the source indentation was lost; this override is assumed to
    # apply to every property rather than only the last one - confirm.
    p.data['number-habitable-rooms'] = 3
|
||||
|
||||
# property id -> nested list of recommendations (one inner list per recommender)
recommendations = {}
# one flat scoring record per recommendation, across all properties
recommendations_scoring_data = []

for p in input_properties:
    # Property recommendations
    p.get_components(cleaned)

    # Run the floor recommender first, then the wall recommender, keeping only
    # the non-empty results
    property_recommendations = []
    for recommender_class, component in (
        (FloorRecommendations, "floor"),
        (WallRecommendations, "walls"),
    ):
        recommender = recommender_class(
            property_instance=p,
            materials=materials_by_type[component],
        )
        recommender.recommend()

        if recommender.recommendations:
            property_recommendations.append(recommender.recommendations)

    # We insert temporary ids into the recommendations which is important for the optimiser later
    property_recommendations = insert_temp_recommendation_id(property_recommendations)

    # Nothing to score for this property
    if not property_recommendations:
        continue

    recommendations[p.id] = property_recommendations

    # Finally, we'll prepare data for predicting the impact on SAP
    # TODO: We should use the cleaned data from get_components in the data rather than the raw
    # values. We should create a method in Property which takes the EPC data and inserts the cleaned
    # data
    data_processor = DataProcessor(None, newdata=True)
    data_processor.insert_data(pd.DataFrame([p.data.copy()]))
    data_processor.pre_process()

    starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
    ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
    fixed_data = data_processor.get_fixed_features()

    # We update the ending record with the recommended updates and we set lodgement date to today
    ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at

    recommendations_scoring_data.extend(
        create_recommendation_scoring_data(
            property=p,
            recommendation=rec,
            starting_epc_data=starting_epc_data,
            ending_epc_data=ending_epc_data,
            fixed_data=fixed_data,
        )
        for recommendations_by_type in property_recommendations
        for rec in recommendations_by_type
    )

    # cleanup
    # NOTE(review): assumed to run once per property (inside the loop) - confirm.
    del data_processor
|
||||
187
backend/app/plan/utils.py
Normal file
187
backend/app/plan/utils.py
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
import pandas as pd
|
||||
from backend.Property import Property
|
||||
from collections import defaultdict
|
||||
from utils.s3 import read_from_s3
|
||||
|
||||
from recommendations.config import UPGRADES_MAP
|
||||
from recommendations.recommendation_utils import get_wall_u_value, get_floor_u_value, get_roof_u_value
|
||||
|
||||
from backend.app.db.utils import row2dict
|
||||
from backend.app.config import get_settings
|
||||
import msgpack
|
||||
|
||||
|
||||
def filter_materials(materials):
    """
    Buckets material rows by the building component they can be applied to.

    Every row is first converted with ``row2dict``. A material lands in the
    "walls" or "floor" bucket when its "type" is one of the insulation types
    listed for that component; materials of any other type are left out.
    :param materials: iterable of material rows accepted by ``row2dict``
    :return: dict with the keys "walls" and "floor", each holding the list of
    matching converted materials in input order
    """
    types_by_component = {
        "walls": ["internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation"],
        "floor": ["suspended_floor_insulation", "solid_floor_insulation"]
    }

    records = [row2dict(row) for row in materials]

    return {
        component: [record for record in records if record["type"] in accepted_types]
        for component, accepted_types in types_by_component.items()
    }
|
||||
|
||||
|
||||
def insert_temp_recommendation_id(property_recommendations):
    """
    Assigns a temporary integer "recommendation_id" to each recommendation.

    Numbering starts at 0 and continues from one group to the next, which keeps
    the ids unique across the whole nested list. The dictionaries are updated in
    place.
    :param property_recommendations: nested list of recommendations (a list of groups,
    each group being a list of recommendation dicts)
    :return: the nested list that was passed in, where each recommendation now has a
    "recommendation_id" integer
    """
    flattened = (rec for recs in property_recommendations for rec in recs)

    for running_id, rec in enumerate(flattened):
        rec["recommendation_id"] = running_id

    return property_recommendations
|
||||
|
||||
|
||||
def get_cleaned():
    """
    Retrieves the cleaned EPC descriptions dataset from s3 and decodes it.

    The file "cleaned_epc_data/cleaned.bson" is read from the bucket
    "retrofit-data-<ENVIRONMENT>", with ENVIRONMENT taken from the application
    settings. The bytes are MessagePack encoded (regardless of the ".bson"
    extension) and are unpacked before being returned.
    :return: the unpacked MessagePack payload
    """
    environment = get_settings().ENVIRONMENT
    encoded = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-{environment}".format(environment=environment),
    )

    # raw=False asks msgpack to decode strings to str rather than leaving them as bytes
    return msgpack.unpackb(encoded, raw=False)
|
||||
|
||||
|
||||
def _estimate_wall_u_value(prop):
    """Estimate the wall u-value from the property's cleaned wall attributes and age band."""
    return get_wall_u_value(
        clean_description=prop.walls["clean_description"],
        age_band=prop.age_band,
        is_granite_or_whinstone=prop.walls["is_granite_or_whinstone"],
        is_sandstone_or_limestone=prop.walls["is_sandstone_or_limestone"],
    )


def _estimate_floor_u_value(prop):
    """Estimate the floor u-value from the property's floor geometry, construction and age band."""
    return get_floor_u_value(
        floor_type=prop.floor_type,
        area=prop.floor_area,
        perimeter=prop.perimeter,
        wall_type=prop.wall_type,
        insulation_thickness=prop.floor["insulation_thickness"],
        age_band=prop.age_band,
    )


def _estimate_roof_u_value(prop):
    """Estimate the roof u-value from the property's cleaned roof attributes and age band."""
    return get_roof_u_value(
        insulation_thickness=prop.roof["insulation_thickness"],
        has_dwelling_above=prop.roof["has_dwelling_above"],
        is_loft=prop.roof["is_loft"],
        is_roof_room=prop.roof["is_roof_room"],
        is_thatched=prop.roof["is_thatched"],
        age_band=prop.age_band,
        is_flat=prop.roof["is_flat"],
        is_pitched=prop.roof["is_pitched"],
        is_at_rafters=prop.roof["is_at_rafters"],
    )


def _fill_missing_u_value(scoring_dict, key, estimator, prop):
    """Set scoring_dict[key] to estimator(prop) when the current value is falsy."""
    # NOTE(review): this is a truthiness check, so a u-value of 0 counts as missing
    # while NaN does not - confirm how missing values arrive from the data processor.
    if not scoring_dict[key]:
        scoring_dict[key] = estimator(prop)


def _fill_missing_thickness(scoring_dict, key):
    """Replace a None insulation thickness at scoring_dict[key] with the label "none"."""
    if scoring_dict[key] is None:
        scoring_dict[key] = "none"


def create_recommendation_scoring_data(
    property: Property,
    recommendation: dict,
    starting_epc_data: pd.DataFrame,
    ending_epc_data: pd.DataFrame,
    fixed_data: pd.DataFrame,
):
    """
    This wrapper function prepares data to be passed to the sap model api

    The record is made of the property identifiers followed by the first row of
    each of the three frames (later frames win on duplicate keys). Missing
    starting and ending u-values are estimated from the property, and missing
    insulation thicknesses are labelled "none". For the component targeted by the
    recommendation, the ending u-value is taken from the recommendation instead.
    :param property: property being scored; the estimators read its walls, floor, roof,
    age band and floor geometry
    :param recommendation: recommendation dict; "recommendation_id" and "type" are always
    read, "new_u_value" for wall and floor insulation, and "parts" for floor insulation
    :param starting_epc_data: frame whose first row holds the starting features
    :param ending_epc_data: frame whose first row holds the "_ENDING" features
    :param fixed_data: frame whose first row holds the fixed features
    :return: dict with one entry per scoring feature
    :raises NotImplementedError: if the recommendation type is neither "wall_insulation"
    nor "floor_insulation", or if a floor insulation recommendation has more than one part
    """

    scoring_dict = {
        "UPRN": property.data["uprn"],
        "id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
        "LOCAL_AUTHORITY": property.data["local-authority"],
        **starting_epc_data.to_dict("records")[0],
        **ending_epc_data.to_dict("records")[0],
        **fixed_data.to_dict("records")[0]
    }

    # Set starting u-values if we don't have them
    _fill_missing_u_value(scoring_dict, "walls_thermal_transmittance", _estimate_wall_u_value, property)
    _fill_missing_u_value(scoring_dict, "floor_thermal_transmittance", _estimate_floor_u_value, property)
    _fill_missing_u_value(scoring_dict, "roof_thermal_transmittance", _estimate_roof_u_value, property)

    for col in [
        "walls_insulation_thickness", "floor_insulation_thickness", "roof_insulation_thickness"
    ]:
        _fill_missing_thickness(scoring_dict, col)

    # Walls: either apply the upgrade or carry the existing state into the ending record
    if recommendation["type"] == "wall_insulation":
        # The upgrade made here is to the u-value of the walls and the description of the
        # insulation thickness
        scoring_dict["walls_thermal_transmittance_ENDING"] = recommendation["new_u_value"]
        scoring_dict["walls_insulation_thickness_ENDING"] = "above average"
    else:
        _fill_missing_u_value(
            scoring_dict, "walls_thermal_transmittance_ENDING", _estimate_wall_u_value, property
        )
        _fill_missing_thickness(scoring_dict, "walls_insulation_thickness_ENDING")

    # Floor: either apply the upgrade or carry the existing state into the ending record
    if recommendation["type"] == "floor_insulation":

        if len(recommendation["parts"]) > 1:
            raise NotImplementedError("Have more than 1 floor insulation part - handle this case")

        scoring_dict["floor_thermal_transmittance_ENDING"] = recommendation["new_u_value"]
        # We don't really see above average for this in the training data
        scoring_dict["floor_insulation_thickness_ENDING"] = "average"
    else:
        _fill_missing_u_value(
            scoring_dict, "floor_thermal_transmittance_ENDING", _estimate_floor_u_value, property
        )
        _fill_missing_thickness(scoring_dict, "floor_insulation_thickness_ENDING")

    # Only wall and floor insulation are supported; the check deliberately stays
    # after the wall/floor handling so the order of any raised errors is unchanged
    if recommendation["type"] not in ["wall_insulation", "floor_insulation"]:
        raise NotImplementedError("Implement me")

    # Roof: no roof recommendation exists yet, so the ending state mirrors the property
    _fill_missing_u_value(
        scoring_dict, "roof_thermal_transmittance_ENDING", _estimate_roof_u_value, property
    )
    _fill_missing_thickness(scoring_dict, "roof_insulation_thickness_ENDING")

    return scoring_dict
|
||||
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
|
@ -79,17 +79,17 @@ def sap_to_epc(sap_points: int):
|
|||
if sap_points <= 0 or sap_points > 100:
|
||||
raise ValueError("SAP points should be between 1 and 100.")
|
||||
|
||||
if sap_points > 91:
|
||||
if sap_points >= 92:
|
||||
return "A"
|
||||
elif sap_points > 80:
|
||||
elif sap_points >= 81:
|
||||
return "B"
|
||||
elif sap_points > 69:
|
||||
elif sap_points >= 69:
|
||||
return "C"
|
||||
elif sap_points > 55:
|
||||
elif sap_points >= 55:
|
||||
return "D"
|
||||
elif sap_points > 39:
|
||||
elif sap_points >= 39:
|
||||
return "E"
|
||||
elif sap_points > 21:
|
||||
elif sap_points >= 21:
|
||||
return "F"
|
||||
else:
|
||||
return "G"
|
||||
|
|
@ -108,13 +108,13 @@ def epc_to_sap_lower_bound(epc: str):
|
|||
elif epc == "B":
|
||||
return 81
|
||||
elif epc == "C":
|
||||
return 70
|
||||
return 69
|
||||
elif epc == "D":
|
||||
return 56
|
||||
return 55
|
||||
elif epc == "E":
|
||||
return 40
|
||||
return 39
|
||||
elif epc == "F":
|
||||
return 22
|
||||
return 21
|
||||
elif epc == "G":
|
||||
return 1
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -62,14 +62,14 @@ class SAPChangeModelAPI:
|
|||
logger.info("Making request to sap change api")
|
||||
url = f"{self.base_url}/sapmodel/predict"
|
||||
payload = {
|
||||
"file_location": f"s3://retrofit-data-dev/{file_location}",
|
||||
"file_location": file_location,
|
||||
"property_id": "", # This should get removed
|
||||
"portfolio_id": self.portfolio_id,
|
||||
"created_at": self.timestamp
|
||||
}
|
||||
|
||||
try:
|
||||
response = requests.post(url, json=payload, headers={"Content-Type": "application/json"})
|
||||
response = requests.post(url, json=payload, headers={"Content-Type": "application/json"}, timeout=120)
|
||||
|
||||
# Check if the response status code is 2xx (success)
|
||||
response.raise_for_status()
|
||||
|
|
|
|||
|
|
@ -34,4 +34,5 @@ pytz==2023.3
|
|||
mip==1.15.0
|
||||
boto3==1.28.3
|
||||
pandas==1.5.3
|
||||
pyarrow==12.0.1
|
||||
pyarrow==12.0.1
|
||||
textblob
|
||||
|
|
@ -1,15 +1,17 @@
|
|||
import pytest
|
||||
import pandas as pd
|
||||
from unittest.mock import Mock
|
||||
from epc_api.client import EpcClient
|
||||
from backend.Property import Property
|
||||
from open_uprn.OpenUprnClient import OpenUprnClient
|
||||
from model_data.EpcClean import EpcClean
|
||||
from etl.epc_clean.EpcClean import EpcClean
|
||||
|
||||
# Define some test data
|
||||
mock_epc_response = {
|
||||
"rows": [
|
||||
{
|
||||
"lmk-key": 1,
|
||||
"uprn": 1,
|
||||
"number-habitable-rooms": 5,
|
||||
"property-type": "House",
|
||||
"inspection-date": "2023-06-01",
|
||||
"some-other-key": "some-value",
|
||||
"roof-description": "Roof Description",
|
||||
|
|
@ -34,6 +36,10 @@ mock_epc_response = {
|
|||
"construction-age-band": "England and Wales: 1967-1975"
|
||||
},
|
||||
{
|
||||
"lmk-key": 2,
|
||||
"uprn": 2,
|
||||
"number-habitable-rooms": 5,
|
||||
"property-type": "House",
|
||||
"inspection-date": "2023-05-01",
|
||||
"some-other-key": "some-other-value",
|
||||
"roof-description": "Roof Description",
|
||||
|
|
@ -63,6 +69,10 @@ mock_epc_response = {
|
|||
mock_epc_response_dupe = {
|
||||
'rows': [
|
||||
{
|
||||
"lmk-key": 1,
|
||||
"uprn": 1,
|
||||
"number-habitable-rooms": 5,
|
||||
"property-type": "House",
|
||||
'inspection-date': '2023-06-01', 'some-other-key': 'some-value', 'roof-description': 'Roof Description',
|
||||
'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
|
||||
'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
|
||||
|
|
@ -83,6 +93,10 @@ mock_epc_response_dupe = {
|
|||
"construction-age-band": "England and Wales: 1967-1975"
|
||||
},
|
||||
{
|
||||
"lmk-key": 2,
|
||||
"uprn": 2,
|
||||
"number-habitable-rooms": 5,
|
||||
"property-type": "House",
|
||||
'inspection-date': '2023-05-01', 'some-other-key': 'some-other-value',
|
||||
'roof-description': 'Roof Description', 'walls-description': 'Walls Description',
|
||||
'windows-description': 'Windows Description', 'mainheat-description': 'Main Heating Description',
|
||||
|
|
@ -104,6 +118,10 @@ mock_epc_response_dupe = {
|
|||
"construction-age-band": "England and Wales: 1967-1975"
|
||||
},
|
||||
{
|
||||
"lmk-key": 3,
|
||||
"uprn": 3,
|
||||
"number-habitable-rooms": 5,
|
||||
"property-type": "House",
|
||||
'inspection-date': '2023-06-01', 'some-other-key': 'duplicate-date',
|
||||
'roof-description': 'Roof Description',
|
||||
'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
|
||||
|
|
@ -130,7 +148,7 @@ mock_epc_response_dupe = {
|
|||
|
||||
class TestProperty:
|
||||
@pytest.fixture(autouse=True)
|
||||
def property_instance(self, mock_epc_client, mock_open_uprn_client, mock_cleaner):
|
||||
def property_instance(self, mock_epc_client, mock_cleaner):
|
||||
property_instance = Property(1, "AB12CD", "Test Address", epc_client=mock_epc_client)
|
||||
return property_instance
|
||||
|
||||
|
|
@ -141,29 +159,18 @@ class TestProperty:
|
|||
|
||||
@pytest.fixture
|
||||
def mock_epc_client(self):
|
||||
mock_epc_client = Mock(spec=EpcClient())
|
||||
mock_epc_client = Mock(spec=EpcClient(auth_token="mocked_auth_token"))
|
||||
mock_epc_client.domestic.search.return_value = mock_epc_response.copy()
|
||||
mock_epc_client.auth_token = "mocked_auth_token"
|
||||
return mock_epc_client
|
||||
|
||||
@pytest.fixture
|
||||
def mock_epc_client_dupe_data(self):
|
||||
mock_epc_client_dupe_data = Mock(spec=EpcClient())
|
||||
mock_epc_client_dupe_data = Mock(spec=EpcClient(auth_token="mocked_auth_token"))
|
||||
mock_epc_client_dupe_data.domestic.search.return_value = mock_epc_response_dupe.copy()
|
||||
mock_epc_client_dupe_data.auth_token = "mocked_auth_token"
|
||||
return mock_epc_client_dupe_data
|
||||
|
||||
@pytest.fixture
|
||||
def mock_open_uprn_client(self):
|
||||
mock_open_uprn_client = Mock(spec=OpenUprnClient(path=None, uprns=[12345]))
|
||||
mock_open_uprn_client.data = pd.DataFrame(
|
||||
[
|
||||
{"UPRN": 12345, "longitude": 1.2345, "latitude": 2.3456},
|
||||
{"UPRN": 12346, "longitude": 3.4567, "latitude": 4.5678}
|
||||
]
|
||||
)
|
||||
return mock_open_uprn_client
|
||||
|
||||
@pytest.fixture
|
||||
def mock_cleaner(self):
|
||||
lighting_averages = [
|
||||
|
|
@ -186,9 +193,22 @@ class TestProperty:
|
|||
)
|
||||
|
||||
mock_cleaner = Mock(spec=cleaner_spec)
|
||||
|
||||
walls_data = {
|
||||
"original_description": "Walls Description",
|
||||
"is_cavity_wall": True,
|
||||
"is_solid_brick": False,
|
||||
"is_timber_frame": False,
|
||||
"is_system_built": False,
|
||||
"is_park_home": False,
|
||||
"is_cob": False,
|
||||
"is_sandstone_or_limestone": False,
|
||||
"is_granite_or_whinstone": False,
|
||||
}
|
||||
|
||||
mock_cleaner.cleaned = {
|
||||
"roof-description": [{"original_description": "Roof Description"}],
|
||||
"walls-description": [{"original_description": "Walls Description"}],
|
||||
"walls-description": [walls_data],
|
||||
"windows-description": [{"original_description": "Windows Description"}],
|
||||
"mainheat-description": [{"original_description": "Main Heating Description"}],
|
||||
"hotwater-description": [{"original_description": "Hot Water Description"}],
|
||||
|
|
@ -201,10 +221,10 @@ class TestProperty:
|
|||
# Should be mocked auth token
|
||||
assert inst1.epc_client.auth_token == "mocked_auth_token"
|
||||
|
||||
inst2 = Property(3, "AB12CD", "Test Address")
|
||||
inst2 = Property(3, "AB12CD", "Test Address", epc_client=mock_epc_client)
|
||||
assert inst2.epc_client.auth_token
|
||||
|
||||
inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"})
|
||||
inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"}, epc_client=mock_epc_client)
|
||||
assert inst3.data == {"some": "data"}
|
||||
|
||||
data = inst3.search_address_epc()
|
||||
|
|
@ -227,11 +247,23 @@ class TestProperty:
|
|||
|
||||
# Verify that the components are set correctly
|
||||
assert property_instance.roof == {"original_description": "Roof Description"}
|
||||
assert property_instance.walls == {"original_description": "Walls Description"}
|
||||
assert property_instance.walls == {
|
||||
"original_description": "Walls Description",
|
||||
"is_cavity_wall": True,
|
||||
"is_solid_brick": False,
|
||||
"is_timber_frame": False,
|
||||
"is_system_built": False,
|
||||
"is_park_home": False,
|
||||
"is_cob": False,
|
||||
"is_sandstone_or_limestone": False,
|
||||
"is_granite_or_whinstone": False,
|
||||
}
|
||||
assert property_instance.windows == {"original_description": "Windows Description"}
|
||||
assert property_instance.main_heating == {"original_description": "Main Heating Description"}
|
||||
assert property_instance.hotwater == {"original_description": "Hot Water Description"}
|
||||
|
||||
assert property_instance.wall_type == "cavity"
|
||||
|
||||
def test_get_components_without_cleaned_data(self, property_instance, mock_cleaner):
|
||||
# Modify the mock EpcClean to not have cleaned data
|
||||
mock_cleaner.cleaned = {}
|
||||
|
|
|
|||
|
|
@ -1,51 +0,0 @@
|
|||
"""
|
||||
This application reads in the open uprn data from a static location and loads it into
|
||||
our database for querying from other services
|
||||
"""
|
||||
|
||||
import os
|
||||
from conservation_areas.ConservationAreaClient import ConservationAreaClient
|
||||
from datatypes.datatypes import OpenUprnCoordinateData
|
||||
|
||||
|
||||
def app():
    """Flag a fixed sample of Open UPRN coordinates by conservation-area membership.

    Builds a ``ConservationAreaClient`` from the two conservation-area files
    stored under this module's directory, loads them with ``read()``, and asks
    the client whether each hard-coded sample coordinate lies inside a
    conservation area. The flags are only gathered in a local list for now;
    nothing is returned or persisted (see the TODO at the end).
    """
    # Both input files are addressed relative to the directory of this module.
    base_dir = os.path.abspath(os.path.dirname(__file__))
    historic_england_file = (
        base_dir + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp"
    )
    gov_file = base_dir + "/model_data/local_data/gov-conservation-area.geojson"

    conservation_area_client = ConservationAreaClient(
        historic_england_path=historic_england_file,
        gov_path=gov_file,
    )
    conservation_area_client.read()

    # Sample Open UPRN rows to be checked against the conservation areas.
    open_uprn_data = [
        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0,
         'LATITUDE': 51.5191407, 'LONGITUDE': -0.0540506},
        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0,
         'LATITUDE': 51.5277492, 'LONGITUDE': -0.0498772},
        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02,
         'LATITUDE': 51.4875579, 'LONGITUDE': -0.226392},
        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0,
         'LATITUDE': 51.5455629, 'LONGITUDE': -0.0792445},
        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0,
         'LATITUDE': 51.5455629, 'LONGITUDE': -0.0792445},
        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0,
         'LATITUDE': 51.5532385, 'LONGITUDE': -0.0468833},
        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0,
         'LATITUDE': 51.5756908, 'LONGITUDE': -0.1362513},
        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0,
         'LATITUDE': 51.4982309, 'LONGITUDE': -0.0823165},
    ]

    # One record per sample row: the UPRN plus the client's membership verdict.
    result = []
    for coordinates in open_uprn_data:
        result.append(
            {
                "uprn": coordinates["UPRN"],
                "is_in_conservation_area": conservation_area_client.is_in_conservation_area(
                    OpenUprnCoordinateData(**coordinates)
                ),
            }
        )

    # TODO: Add a method to write to the database
|
||||
|
|
@ -1,5 +0,0 @@
|
|||
# Data Collection
|
||||
|
||||
This service is specifically focused on the collection of data from external sources which aren't easily
|
||||
accessed via api or via downloadable data sources. For example, wages data requires a specific application to
|
||||
pull that data from websites, e.g. from Adzuna's API.
|
||||
|
|
@ -1,86 +0,0 @@
|
|||
import requests
|
||||
import json
|
||||
from data_collection.config import ADZUNA_API_KEY, ADZUNA_APP_ID
|
||||
|
||||
import pandas as pd
|
||||
import os
|
||||
import time
|
||||
from tqdm import tqdm
|
||||
|
||||
"""
|
||||
The table of constituencies and their codes can be downloaded from the Office for National Statistics, found here:
|
||||
https://geoportal.statistics.gov.uk/datasets/ons::westminster-parliamentary-constituencies-december-2022-names-and
|
||||
-codes-in-the-united-kingdom/explore
|
||||
"""
|
||||
|
||||
# The constituency lookup CSV is located relative to this module's directory.
_module_dir = os.path.abspath(os.path.dirname(__file__))
constituencies = pd.read_csv(
    f"{_module_dir}/data_collection/data/"
    "Westminster_Parliamentary_Constituencies_(December_2022)_Names_and_Codes_in_the_United_Kingdom.csv"
)

# Tag every row with the kind of location it represents.
constituencies["location_type"] = "constituency"
|
||||
|
||||
|
||||
def retry_api_call(job_title, location, max_retries=10):
    """Call ``get_adzuna_jobs`` until it succeeds or the attempts run out.

    Args:
        job_title: Search phrase, forwarded as the Adzuna ``what`` parameter.
        location: Place name, forwarded as the Adzuna ``where`` parameter.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The decoded JSON payload of the first successful attempt, or ``None``
        when every attempt failed (or ``max_retries`` is not positive).
    """
    for attempt in range(1, max_retries + 1):
        try:
            return get_adzuna_jobs(job_title, location)
        except (requests.HTTPError, requests.ConnectionError, requests.Timeout):
            # Timeout is caught explicitly: a read timeout is not a
            # ConnectionError and would otherwise abort the whole crawl.
            if attempt == max_retries:
                # Nothing left to retry, so do not announce one or wait for it.
                print(f"Attempt {attempt} failed.")
                break
            print(f"Attempt {attempt} failed. Retrying in 2 seconds...")
            time.sleep(2)
    print(f"Failed after {max_retries} attempts.")
    return None


def get_adzuna_jobs(job_title, location, timeout=30):
    """Fetch the first results page of an Adzuna GB job search.

    Args:
        job_title: Search phrase sent as the ``what`` query parameter.
        location: Place name sent as the ``where`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the crawl.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: The API answered with a 4xx/5xx status.
        requests.ConnectionError: The connection could not be established.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    base_url = "https://api.adzuna.com/v1/api/jobs"
    country_code = "gb"

    # Page 1 only: at most ``results_per_page`` adverts are returned per search.
    url = f"{base_url}/{country_code}/search/1"

    params = {
        "app_id": ADZUNA_APP_ID,
        "app_key": ADZUNA_API_KEY,
        "results_per_page": 25,
        "what": job_title,
        "where": location,
        "content-type": "application/json",
        "distance": 10,
    }

    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    return response.json()
|
||||
|
||||
|
||||
JOB_TITLES = [
|
||||
"insulation installer", "internal wall insulation installer", "external wall insulation installer",
|
||||
"cavity wall insulation installer", "loft insulation installer", "roof insulation installer",
|
||||
"spray foam insulation installer", "insulation technician", "insulation engineer", "iwi insulation installer",
|
||||
"iwi installer", "ewi insulation installer", "ewi installer", "cwi insulation installer", "cwi installer",
|
||||
]
|
||||
|
||||
# Crawl every (job title, constituency) pair and flatten the adverts into rows.
results = []
for i, job_title in enumerate(JOB_TITLES):
    print(f"Pulling job title {i + 1} of {len(JOB_TITLES)}")
    for _, location_config in tqdm(constituencies.iterrows(), total=constituencies.shape[0]):
        location = location_config["PCON22NM"]
        jobs = retry_api_call(job_title, location)
        # Pause between searches whether or not the last one succeeded.
        time.sleep(0.5)

        if not jobs:
            # retry_api_call returns None once its attempts are exhausted;
            # skip this search instead of failing on a None subscript.
            continue

        # A payload with no "results" key is treated the same as an empty page.
        for job in jobs.get("results") or []:
            results.append(
                {
                    "job_title": job_title,
                    "search_location": location,
                    "search_location_code": location_config["PCON22CD"],
                    **job,
                }
            )

results_df = pd.DataFrame(results)
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load credentials from the .env file. The original lookup is relative to the
# current working directory, so it silently finds nothing when the process is
# started from anywhere else. load_dotenv reports whether anything was loaded,
# so fall back to a .env sitting next to this module in that case.
# NOTE(review): assumes the .env lives beside this config module - confirm.
if not load_dotenv(dotenv_path='data_collection/.env'):
    load_dotenv(dotenv_path=os.path.join(os.path.dirname(os.path.abspath(__file__)), '.env'))

# Both values are None when the variables are not set.
ADZUNA_API_KEY = os.environ.get('ADZUNA_API_KEY')
ADZUNA_APP_ID = os.environ.get('ADZUNA_APP_ID')
|
||||
BIN
data_collection/data/.DS_Store
vendored
BIN
data_collection/data/.DS_Store
vendored
Binary file not shown.
|
|
@ -1,651 +0,0 @@
|
|||
PCON22CD,PCON22NM,ObjectId
|
||||
E14000530,Aldershot,1
|
||||
E14000531,Aldridge-Brownhills,2
|
||||
E14000532,Altrincham and Sale West,3
|
||||
E14000533,Amber Valley,4
|
||||
E14000534,Arundel and South Downs,5
|
||||
E14000535,Ashfield,6
|
||||
E14000536,Ashford,7
|
||||
E14000537,Ashton-under-Lyne,8
|
||||
E14000538,Aylesbury,9
|
||||
E14000539,Banbury,10
|
||||
E14000540,Barking,11
|
||||
E14000541,Barnsley Central,12
|
||||
E14000542,Barnsley East,13
|
||||
E14000543,Barrow and Furness,14
|
||||
E14000544,Basildon and Billericay,15
|
||||
E14000545,Basingstoke,16
|
||||
E14000546,Bassetlaw,17
|
||||
E14000547,Bath,18
|
||||
E14000548,Batley and Spen,19
|
||||
E14000549,Battersea,20
|
||||
E14000550,Beaconsfield,21
|
||||
E14000551,Beckenham,22
|
||||
E14000552,Bedford,23
|
||||
E14000553,Bermondsey and Old Southwark,24
|
||||
E14000554,Berwick-upon-Tweed,25
|
||||
E14000555,Bethnal Green and Bow,26
|
||||
E14000556,Beverley and Holderness,27
|
||||
E14000557,Bexhill and Battle,28
|
||||
E14000558,Bexleyheath and Crayford,29
|
||||
E14000559,Birkenhead,30
|
||||
E14000560,"Birmingham, Edgbaston",31
|
||||
E14000561,"Birmingham, Erdington",32
|
||||
E14000562,"Birmingham, Hall Green",33
|
||||
E14000563,"Birmingham, Hodge Hill",34
|
||||
E14000564,"Birmingham, Ladywood",35
|
||||
E14000565,"Birmingham, Northfield",36
|
||||
E14000566,"Birmingham, Perry Barr",37
|
||||
E14000567,"Birmingham, Selly Oak",38
|
||||
E14000568,"Birmingham, Yardley",39
|
||||
E14000569,Bishop Auckland,40
|
||||
E14000570,Blackburn,41
|
||||
E14000571,Blackley and Broughton,42
|
||||
E14000572,Blackpool North and Cleveleys,43
|
||||
E14000573,Blackpool South,44
|
||||
E14000574,Blaydon,45
|
||||
E14000575,Blyth Valley,46
|
||||
E14000576,Bognor Regis and Littlehampton,47
|
||||
E14000577,Bolsover,48
|
||||
E14000578,Bolton North East,49
|
||||
E14000579,Bolton South East,50
|
||||
E14000830,Newbury,51
|
||||
E14000831,Newcastle upon Tyne Central,52
|
||||
E14000832,Newcastle upon Tyne East,53
|
||||
E14000833,Newcastle upon Tyne North,54
|
||||
E14000834,Newcastle-under-Lyme,55
|
||||
E14000835,Newton Abbot,56
|
||||
E14000836,"Normanton, Pontefract and Castleford",57
|
||||
E14000837,North Cornwall,58
|
||||
E14000838,North Devon,59
|
||||
E14000839,North Dorset,60
|
||||
E14000840,North Durham,61
|
||||
E14000841,North East Bedfordshire,62
|
||||
E14000842,North East Cambridgeshire,63
|
||||
E14000843,North East Derbyshire,64
|
||||
E14000844,North East Hampshire,65
|
||||
E14000845,North East Hertfordshire,66
|
||||
E14000846,North East Somerset,67
|
||||
E14000847,North Herefordshire,68
|
||||
E14000848,North Norfolk,69
|
||||
E14000849,North Shropshire,70
|
||||
E14000850,North Somerset,71
|
||||
E14000851,North Swindon,72
|
||||
E14000852,North Thanet,73
|
||||
E14000853,North Tyneside,74
|
||||
E14000854,North Warwickshire,75
|
||||
E14000855,North West Cambridgeshire,76
|
||||
E14000856,North West Durham,77
|
||||
E14000857,North West Hampshire,78
|
||||
E14000858,North West Leicestershire,79
|
||||
E14000859,North West Norfolk,80
|
||||
E14000860,North Wiltshire,81
|
||||
E14000861,Northampton North,82
|
||||
E14000862,Northampton South,83
|
||||
E14000863,Norwich North,84
|
||||
E14000864,Norwich South,85
|
||||
E14000865,Nottingham East,86
|
||||
E14000866,Nottingham North,87
|
||||
E14000867,Nottingham South,88
|
||||
E14000868,Nuneaton,89
|
||||
E14000869,Old Bexley and Sidcup,90
|
||||
E14000870,Oldham East and Saddleworth,91
|
||||
E14000871,Oldham West and Royton,92
|
||||
E14000872,Orpington,93
|
||||
E14000873,Oxford East,94
|
||||
E14000874,Oxford West and Abingdon,95
|
||||
E14000875,Pendle,96
|
||||
E14000876,Penistone and Stocksbridge,97
|
||||
E14000877,Penrith and The Border,98
|
||||
E14000878,Peterborough,99
|
||||
E14000879,"Plymouth, Moor View",100
|
||||
E14000580,Bolton West,101
|
||||
E14000581,Bootle,102
|
||||
E14000582,Boston and Skegness,103
|
||||
E14000583,Bosworth,104
|
||||
E14000584,Bournemouth East,105
|
||||
E14000585,Bournemouth West,106
|
||||
E14000586,Bracknell,107
|
||||
E14000587,Bradford East,108
|
||||
E14000588,Bradford South,109
|
||||
E14000589,Bradford West,110
|
||||
E14000590,Braintree,111
|
||||
E14000591,Brent Central,112
|
||||
E14000592,Brent North,113
|
||||
E14000593,Brentford and Isleworth,114
|
||||
E14000594,Brentwood and Ongar,115
|
||||
E14000595,Bridgwater and West Somerset,116
|
||||
E14000596,Brigg and Goole,117
|
||||
E14000597,"Brighton, Kemptown",118
|
||||
E14000598,"Brighton, Pavilion",119
|
||||
E14000599,Bristol East,120
|
||||
E14000600,Bristol North West,121
|
||||
E14000601,Bristol South,122
|
||||
E14000602,Bristol West,123
|
||||
E14000603,Broadland,124
|
||||
E14000604,Bromley and Chislehurst,125
|
||||
E14000605,Bromsgrove,126
|
||||
E14000606,Broxbourne,127
|
||||
E14000607,Broxtowe,128
|
||||
E14000608,Buckingham,129
|
||||
E14000609,Burnley,130
|
||||
E14000610,Burton,131
|
||||
E14000611,Bury North,132
|
||||
E14000612,Bury South,133
|
||||
E14000613,Bury St Edmunds,134
|
||||
E14000614,Calder Valley,135
|
||||
E14000615,Camberwell and Peckham,136
|
||||
E14000616,Camborne and Redruth,137
|
||||
E14000617,Cambridge,138
|
||||
E14000618,Cannock Chase,139
|
||||
E14000619,Canterbury,140
|
||||
E14000620,Carlisle,141
|
||||
E14000621,Carshalton and Wallington,142
|
||||
E14000622,Castle Point,143
|
||||
E14000623,Central Devon,144
|
||||
E14000624,Central Suffolk and North Ipswich,145
|
||||
E14000625,Charnwood,146
|
||||
E14000626,Chatham and Aylesford,147
|
||||
E14000627,Cheadle,148
|
||||
E14000628,Chelmsford,149
|
||||
E14000629,Chelsea and Fulham,150
|
||||
E14000630,Cheltenham,151
|
||||
E14000631,Chesham and Amersham,152
|
||||
E14000632,Chesterfield,153
|
||||
E14000633,Chichester,154
|
||||
E14000634,Chingford and Woodford Green,155
|
||||
E14000635,Chippenham,156
|
||||
E14000636,Chipping Barnet,157
|
||||
E14000637,Chorley,158
|
||||
E14000638,Christchurch,159
|
||||
E14000639,Cities of London and Westminster,160
|
||||
E14000640,City of Chester,161
|
||||
E14000641,City of Durham,162
|
||||
E14000642,Clacton,163
|
||||
E14000643,Cleethorpes,164
|
||||
E14000644,Colchester,165
|
||||
E14000645,Colne Valley,166
|
||||
E14000646,Congleton,167
|
||||
E14000647,Copeland,168
|
||||
E14000648,Corby,169
|
||||
E14000649,Coventry North East,170
|
||||
E14000650,Coventry North West,171
|
||||
E14000651,Coventry South,172
|
||||
E14000652,Crawley,173
|
||||
E14000653,Crewe and Nantwich,174
|
||||
E14000654,Croydon Central,175
|
||||
E14000655,Croydon North,176
|
||||
E14000656,Croydon South,177
|
||||
E14000657,Dagenham and Rainham,178
|
||||
E14000658,Darlington,179
|
||||
E14000659,Dartford,180
|
||||
E14000660,Daventry,181
|
||||
E14000661,Denton and Reddish,182
|
||||
E14000662,Derby North,183
|
||||
E14000663,Derby South,184
|
||||
E14000664,Derbyshire Dales,185
|
||||
E14000665,Devizes,186
|
||||
E14000666,Dewsbury,187
|
||||
E14000667,Don Valley,188
|
||||
E14000668,Doncaster Central,189
|
||||
E14000669,Doncaster North,190
|
||||
E14000670,Dover,191
|
||||
E14000671,Dudley North,192
|
||||
E14000672,Dudley South,193
|
||||
E14000673,Dulwich and West Norwood,194
|
||||
E14000674,Ealing Central and Acton,195
|
||||
E14000675,Ealing North,196
|
||||
E14000676,"Ealing, Southall",197
|
||||
E14000677,Easington,198
|
||||
E14000678,East Devon,199
|
||||
E14000679,East Ham,200
|
||||
E14000780,Leeds North West,201
|
||||
E14000781,Leeds West,202
|
||||
E14000782,Leicester East,203
|
||||
E14000783,Leicester South,204
|
||||
E14000784,Leicester West,205
|
||||
E14000785,Leigh,206
|
||||
E14000786,Lewes,207
|
||||
E14000787,Lewisham East,208
|
||||
E14000788,Lewisham West and Penge,209
|
||||
E14000789,"Lewisham, Deptford",210
|
||||
E14000790,Leyton and Wanstead,211
|
||||
E14000791,Lichfield,212
|
||||
E14000792,Lincoln,213
|
||||
E14000793,"Liverpool, Riverside",214
|
||||
E14000794,"Liverpool, Walton",215
|
||||
E14000795,"Liverpool, Wavertree",216
|
||||
E14000796,"Liverpool, West Derby",217
|
||||
E14000797,Loughborough,218
|
||||
E14000798,Louth and Horncastle,219
|
||||
E14000799,Ludlow,220
|
||||
E14000800,Luton North,221
|
||||
E14000801,Luton South,222
|
||||
E14000802,Macclesfield,223
|
||||
E14000803,Maidenhead,224
|
||||
E14000804,Maidstone and The Weald,225
|
||||
E14000805,Makerfield,226
|
||||
E14000806,Maldon,227
|
||||
E14000807,Manchester Central,228
|
||||
E14000808,"Manchester, Gorton",229
|
||||
E14000809,"Manchester, Withington",230
|
||||
E14000810,Mansfield,231
|
||||
E14000811,Meon Valley,232
|
||||
E14000812,Meriden,233
|
||||
E14000813,Mid Bedfordshire,234
|
||||
E14000814,Mid Derbyshire,235
|
||||
E14000815,Mid Dorset and North Poole,236
|
||||
E14000816,Mid Norfolk,237
|
||||
E14000817,Mid Sussex,238
|
||||
E14000818,Mid Worcestershire,239
|
||||
E14000819,Middlesbrough,240
|
||||
E14000820,Middlesbrough South and East Cleveland,241
|
||||
E14000821,Milton Keynes North,242
|
||||
E14000822,Milton Keynes South,243
|
||||
E14000823,Mitcham and Morden,244
|
||||
E14000824,Mole Valley,245
|
||||
E14000825,Morecambe and Lunesdale,246
|
||||
E14000826,Morley and Outwood,247
|
||||
E14000827,New Forest East,248
|
||||
E14000828,New Forest West,249
|
||||
E14000829,Newark,250
|
||||
E14000680,East Hampshire,251
|
||||
E14000681,East Surrey,252
|
||||
E14000682,East Worthing and Shoreham,253
|
||||
E14000683,East Yorkshire,254
|
||||
E14000880,"Plymouth, Sutton and Devonport",255
|
||||
E14000684,Eastbourne,256
|
||||
E14000685,Eastleigh,257
|
||||
E14000881,Poole,258
|
||||
E14000686,Eddisbury,259
|
||||
E14000882,Poplar and Limehouse,260
|
||||
E14000687,Edmonton,261
|
||||
E14000883,Portsmouth North,262
|
||||
E14000688,Ellesmere Port and Neston,263
|
||||
E14000884,Portsmouth South,264
|
||||
E14000689,Elmet and Rothwell,265
|
||||
E14000885,Preston,266
|
||||
E14000690,Eltham,267
|
||||
E14000886,Pudsey,268
|
||||
E14000691,Enfield North,269
|
||||
E14000887,Putney,270
|
||||
E14000692,"Enfield, Southgate",271
|
||||
E14000888,Rayleigh and Wickford,272
|
||||
E14000693,Epping Forest,273
|
||||
E14000889,Reading East,274
|
||||
E14000694,Epsom and Ewell,275
|
||||
E14000890,Reading West,276
|
||||
E14000695,Erewash,277
|
||||
E14000891,Redcar,278
|
||||
E14000696,Erith and Thamesmead,279
|
||||
E14000892,Redditch,280
|
||||
E14000697,Esher and Walton,281
|
||||
E14000893,Reigate,282
|
||||
E14000698,Exeter,283
|
||||
E14000894,Ribble Valley,284
|
||||
E14000699,Fareham,285
|
||||
E14000895,Richmond (Yorks),286
|
||||
E14000700,Faversham and Mid Kent,287
|
||||
E14000896,Richmond Park,288
|
||||
E14000701,Feltham and Heston,289
|
||||
E14000897,Rochdale,290
|
||||
E14000702,Filton and Bradley Stoke,291
|
||||
E14000898,Rochester and Strood,292
|
||||
E14000703,Finchley and Golders Green,293
|
||||
E14000899,Rochford and Southend East,294
|
||||
E14000704,Folkestone and Hythe,295
|
||||
E14000900,Romford,296
|
||||
E14000705,Forest of Dean,297
|
||||
E14000901,Romsey and Southampton North,298
|
||||
E14000706,Fylde,299
|
||||
E14000902,Rossendale and Darwen,300
|
||||
E14000707,Gainsborough,301
|
||||
E14000903,Rother Valley,302
|
||||
E14000904,Rotherham,303
|
||||
E14000905,Rugby,304
|
||||
E14000906,"Ruislip, Northwood and Pinner",305
|
||||
E14000907,Runnymede and Weybridge,306
|
||||
E14000908,Rushcliffe,307
|
||||
E14000909,Rutland and Melton,308
|
||||
E14000910,Saffron Walden,309
|
||||
E14000911,Salford and Eccles,310
|
||||
E14000912,Salisbury,311
|
||||
E14000913,Scarborough and Whitby,312
|
||||
E14000914,Scunthorpe,313
|
||||
E14000915,Sedgefield,314
|
||||
E14000916,Sefton Central,315
|
||||
E14000917,Selby and Ainsty,316
|
||||
E14000918,Sevenoaks,317
|
||||
E14000919,Sheffield Central,318
|
||||
E14000920,Sheffield South East,319
|
||||
E14000921,"Sheffield, Brightside and Hillsborough",320
|
||||
E14000922,"Sheffield, Hallam",321
|
||||
E14000923,"Sheffield, Heeley",322
|
||||
E14000924,Sherwood,323
|
||||
E14000925,Shipley,324
|
||||
E14000926,Shrewsbury and Atcham,325
|
||||
E14000927,Sittingbourne and Sheppey,326
|
||||
E14000928,Skipton and Ripon,327
|
||||
E14000929,Sleaford and North Hykeham,328
|
||||
E14000730,Harrogate and Knaresborough,329
|
||||
E14000731,Harrow East,330
|
||||
E14000732,Harrow West,331
|
||||
E14000733,Hartlepool,332
|
||||
E14000734,Harwich and North Essex,333
|
||||
E14000735,Hastings and Rye,334
|
||||
E14000736,Havant,335
|
||||
E14000737,Hayes and Harlington,336
|
||||
E14000738,Hazel Grove,337
|
||||
E14000739,Hemel Hempstead,338
|
||||
E14000740,Hemsworth,339
|
||||
E14000741,Hendon,340
|
||||
E14000742,Henley,341
|
||||
E14000743,Hereford and South Herefordshire,342
|
||||
E14000744,Hertford and Stortford,343
|
||||
E14000745,Hertsmere,344
|
||||
E14000746,Hexham,345
|
||||
E14000747,Heywood and Middleton,346
|
||||
E14000748,High Peak,347
|
||||
E14000749,Hitchin and Harpenden,348
|
||||
E14000750,Holborn and St Pancras,349
|
||||
E14000751,Hornchurch and Upminster,350
|
||||
E14000752,Hornsey and Wood Green,351
|
||||
E14000753,Horsham,352
|
||||
E14000754,Houghton and Sunderland South,353
|
||||
E14000755,Hove,354
|
||||
E14000756,Huddersfield,355
|
||||
E14000757,Huntingdon,356
|
||||
E14000758,Hyndburn,357
|
||||
E14000759,Ilford North,358
|
||||
E14000760,Ilford South,359
|
||||
E14000761,Ipswich,360
|
||||
E14000762,Isle of Wight,361
|
||||
E14000763,Islington North,362
|
||||
E14000764,Islington South and Finsbury,363
|
||||
E14000765,Jarrow,364
|
||||
E14000766,Keighley,365
|
||||
E14000767,Kenilworth and Southam,366
|
||||
E14000768,Kensington,367
|
||||
E14000769,Kettering,368
|
||||
E14000770,Kingston and Surbiton,369
|
||||
E14000771,Kingston upon Hull East,370
|
||||
E14000772,Kingston upon Hull North,371
|
||||
E14000773,Kingston upon Hull West and Hessle,372
|
||||
E14000774,Kingswood,373
|
||||
E14000775,Knowsley,374
|
||||
E14000776,Lancaster and Fleetwood,375
|
||||
E14000777,Leeds Central,376
|
||||
E14000778,Leeds East,377
|
||||
E14000779,Leeds North East,378
|
||||
E14000708,Garston and Halewood,379
|
||||
E14000709,Gateshead,380
|
||||
E14000710,Gedling,381
|
||||
E14000711,Gillingham and Rainham,382
|
||||
E14000712,Gloucester,383
|
||||
E14000713,Gosport,384
|
||||
E14000714,Grantham and Stamford,385
|
||||
E14000715,Gravesham,386
|
||||
E14000716,Great Grimsby,387
|
||||
E14000717,Great Yarmouth,388
|
||||
E14000718,Greenwich and Woolwich,389
|
||||
E14000719,Guildford,390
|
||||
E14000720,Hackney North and Stoke Newington,391
|
||||
E14000721,Hackney South and Shoreditch,392
|
||||
E14000722,Halesowen and Rowley Regis,393
|
||||
E14000723,Halifax,394
|
||||
E14000724,Haltemprice and Howden,395
|
||||
E14000725,Halton,396
|
||||
E14000726,Hammersmith,397
|
||||
E14000727,Hampstead and Kilburn,398
|
||||
E14000728,Harborough,399
|
||||
E14000729,Harlow,400
|
||||
E14000930,Slough,401
|
||||
E14000931,Solihull,402
|
||||
E14000932,Somerton and Frome,403
|
||||
E14000933,South Basildon and East Thurrock,404
|
||||
E14000934,South Cambridgeshire,405
|
||||
E14000935,South Derbyshire,406
|
||||
E14000936,South Dorset,407
|
||||
E14000937,South East Cambridgeshire,408
|
||||
E14000938,South East Cornwall,409
|
||||
E14000939,South Holland and The Deepings,410
|
||||
E14000940,South Leicestershire,411
|
||||
E14000941,South Norfolk,412
|
||||
E14000942,South Northamptonshire,413
|
||||
E14000943,South Ribble,414
|
||||
E14000944,South Shields,415
|
||||
E14000945,South Staffordshire,416
|
||||
E14000946,South Suffolk,417
|
||||
E14000947,South Swindon,418
|
||||
E14000948,South Thanet,419
|
||||
E14000949,South West Bedfordshire,420
|
||||
E14000950,South West Devon,421
|
||||
E14000951,South West Hertfordshire,422
|
||||
E14000952,South West Norfolk,423
|
||||
E14000953,South West Surrey,424
|
||||
E14000954,South West Wiltshire,425
|
||||
E14000955,"Southampton, Itchen",426
|
||||
E14000956,"Southampton, Test",427
|
||||
E14000957,Southend West,428
|
||||
E14000958,Southport,429
|
||||
E14000959,Spelthorne,430
|
||||
E14000960,St Albans,431
|
||||
E14000961,St Austell and Newquay,432
|
||||
E14000962,St Helens North,433
|
||||
E14000963,St Helens South and Whiston,434
|
||||
E14000964,St Ives,435
|
||||
E14000965,Stafford,436
|
||||
E14000966,Staffordshire Moorlands,437
|
||||
E14000967,Stalybridge and Hyde,438
|
||||
E14000968,Stevenage,439
|
||||
E14000969,Stockport,440
|
||||
E14000970,Stockton North,441
|
||||
E14000971,Stockton South,442
|
||||
E14000972,Stoke-on-Trent Central,443
|
||||
E14000973,Stoke-on-Trent North,444
|
||||
E14000974,Stoke-on-Trent South,445
|
||||
E14000975,Stone,446
|
||||
E14000976,Stourbridge,447
|
||||
E14000977,Stratford-on-Avon,448
|
||||
E14000978,Streatham,449
|
||||
E14000979,Stretford and Urmston,450
|
||||
E14000980,Stroud,451
|
||||
E14000981,Suffolk Coastal,452
|
||||
E14000982,Sunderland Central,453
|
||||
E14000983,Surrey Heath,454
|
||||
E14000984,Sutton and Cheam,455
|
||||
E14000985,Sutton Coldfield,456
|
||||
E14000986,Tamworth,457
|
||||
E14000987,Tatton,458
|
||||
E14000988,Taunton Deane,459
|
||||
E14000989,Telford,460
|
||||
E14000990,Tewkesbury,461
|
||||
E14000991,The Cotswolds,462
|
||||
E14000992,The Wrekin,463
|
||||
E14000993,Thirsk and Malton,464
|
||||
E14000994,Thornbury and Yate,465
|
||||
E14000995,Thurrock,466
|
||||
E14000996,Tiverton and Honiton,467
|
||||
E14000997,Tonbridge and Malling,468
|
||||
E14000998,Tooting,469
|
||||
E14000999,Torbay,470
|
||||
E14001000,Torridge and West Devon,471
|
||||
E14001001,Totnes,472
|
||||
E14001002,Tottenham,473
|
||||
E14001003,Truro and Falmouth,474
|
||||
E14001004,Tunbridge Wells,475
|
||||
E14001005,Twickenham,476
|
||||
E14001006,Tynemouth,477
|
||||
E14001007,Uxbridge and South Ruislip,478
|
||||
E14001008,Vauxhall,479
|
||||
E14001009,Wakefield,480
|
||||
E14001010,Wallasey,481
|
||||
E14001011,Walsall North,482
|
||||
E14001012,Walsall South,483
|
||||
E14001013,Walthamstow,484
|
||||
E14001014,Wansbeck,485
|
||||
E14001015,Wantage,486
|
||||
E14001016,Warley,487
|
||||
E14001017,Warrington North,488
|
||||
E14001018,Warrington South,489
|
||||
E14001019,Warwick and Leamington,490
|
||||
E14001020,Washington and Sunderland West,491
|
||||
E14001021,Watford,492
|
||||
E14001022,Waveney,493
|
||||
E14001023,Wealden,494
|
||||
E14001024,Weaver Vale,495
|
||||
E14001025,Wellingborough,496
|
||||
E14001026,Wells,497
|
||||
E14001027,Welwyn Hatfield,498
|
||||
E14001028,Wentworth and Dearne,499
|
||||
E14001029,West Bromwich East,500
|
||||
E14001030,West Bromwich West,501
|
||||
E14001031,West Dorset,502
|
||||
E14001032,West Ham,503
|
||||
E14001033,West Lancashire,504
|
||||
E14001034,West Suffolk,505
|
||||
E14001035,West Worcestershire,506
|
||||
E14001036,Westminster North,507
|
||||
E14001037,Westmorland and Lonsdale,508
|
||||
E14001038,Weston-Super-Mare,509
|
||||
E14001039,Wigan,510
|
||||
E14001040,Wimbledon,511
|
||||
E14001041,Winchester,512
|
||||
E14001042,Windsor,513
|
||||
E14001043,Wirral South,514
|
||||
E14001044,Wirral West,515
|
||||
E14001045,Witham,516
|
||||
E14001046,Witney,517
|
||||
E14001047,Woking,518
|
||||
E14001048,Wokingham,519
|
||||
E14001049,Wolverhampton North East,520
|
||||
E14001050,Wolverhampton South East,521
|
||||
E14001051,Wolverhampton South West,522
|
||||
E14001052,Worcester,523
|
||||
E14001053,Workington,524
|
||||
E14001054,Worsley and Eccles South,525
|
||||
E14001055,Worthing West,526
|
||||
E14001056,Wycombe,527
|
||||
E14001057,Wyre and Preston North,528
|
||||
E14001058,Wyre Forest,529
|
||||
E14001059,Wythenshawe and Sale East,530
|
||||
E14001060,Yeovil,531
|
||||
E14001061,York Central,532
|
||||
E14001062,York Outer,533
|
||||
N06000001,Belfast East,534
|
||||
N06000002,Belfast North,535
|
||||
N06000003,Belfast South,536
|
||||
N06000004,Belfast West,537
|
||||
N06000005,East Antrim,538
|
||||
N06000006,East Londonderry,539
|
||||
N06000007,Fermanagh and South Tyrone,540
|
||||
N06000008,Foyle,541
|
||||
N06000009,Lagan Valley,542
|
||||
N06000010,Mid Ulster,543
|
||||
N06000011,Newry and Armagh,544
|
||||
N06000012,North Antrim,545
|
||||
N06000013,North Down,546
|
||||
N06000014,South Antrim,547
|
||||
N06000015,South Down,548
|
||||
N06000016,Strangford,549
|
||||
N06000017,Upper Bann,550
|
||||
S14000050,Ochil and South Perthshire,551
|
||||
S14000051,Orkney and Shetland,552
|
||||
S14000052,Paisley and Renfrewshire North,553
|
||||
S14000053,Paisley and Renfrewshire South,554
|
||||
S14000054,Perth and North Perthshire,555
|
||||
S14000055,"Ross, Skye and Lochaber",556
|
||||
S14000056,Rutherglen and Hamilton West,557
|
||||
S14000057,Stirling,558
|
||||
S14000058,West Aberdeenshire and Kincardine,559
|
||||
S14000059,West Dunbartonshire,560
|
||||
W07000041,Ynys Môn,561
|
||||
W07000042,Delyn,562
|
||||
W07000043,Alyn and Deeside,563
|
||||
W07000044,Wrexham,564
|
||||
W07000045,Llanelli,565
|
||||
W07000046,Gower,566
|
||||
W07000047,Swansea West,567
|
||||
W07000048,Swansea East,568
|
||||
W07000049,Aberavon,569
|
||||
W07000050,Cardiff Central,570
|
||||
W07000051,Cardiff North,571
|
||||
W07000052,Rhondda,572
|
||||
W07000053,Torfaen,573
|
||||
W07000054,Monmouth,574
|
||||
W07000055,Newport East,575
|
||||
W07000056,Newport West,576
|
||||
W07000057,Arfon,577
|
||||
W07000058,Aberconwy,578
|
||||
W07000059,Clwyd West,579
|
||||
W07000060,Vale of Clwyd,580
|
||||
W07000061,Dwyfor Meirionnydd,581
|
||||
W07000062,Clwyd South,582
|
||||
W07000063,Montgomeryshire,583
|
||||
W07000064,Ceredigion,584
|
||||
W07000065,Preseli Pembrokeshire,585
|
||||
W07000066,Carmarthen West and South Pembrokeshire,586
|
||||
W07000067,Carmarthen East and Dinefwr,587
|
||||
W07000068,Brecon and Radnorshire,588
|
||||
W07000069,Neath,589
|
||||
W07000070,Cynon Valley,590
|
||||
W07000071,Merthyr Tydfil and Rhymney,591
|
||||
W07000072,Blaenau Gwent,592
|
||||
W07000073,Bridgend,593
|
||||
W07000074,Ogmore,594
|
||||
W07000075,Pontypridd,595
|
||||
W07000076,Caerphilly,596
|
||||
W07000077,Islwyn,597
|
||||
W07000078,Vale of Glamorgan,598
|
||||
W07000079,Cardiff West,599
|
||||
W07000080,Cardiff South and Penarth,600
|
||||
N06000018,West Tyrone,601
|
||||
S14000001,Aberdeen North,602
|
||||
S14000002,Aberdeen South,603
|
||||
S14000003,Airdrie and Shotts,604
|
||||
S14000004,Angus,605
|
||||
S14000005,Argyll and Bute,606
|
||||
S14000006,"Ayr, Carrick and Cumnock",607
|
||||
S14000007,Banff and Buchan,608
|
||||
S14000008,"Berwickshire, Roxburgh and Selkirk",609
|
||||
S14000009,"Caithness, Sutherland and Easter Ross",610
|
||||
S14000010,Central Ayrshire,611
|
||||
S14000011,"Coatbridge, Chryston and Bellshill",612
|
||||
S14000012,"Cumbernauld, Kilsyth and Kirkintilloch East",613
|
||||
S14000013,Dumfries and Galloway,614
|
||||
S14000014,"Dumfriesshire, Clydesdale and Tweeddale",615
|
||||
S14000015,Dundee East,616
|
||||
S14000016,Dundee West,617
|
||||
S14000017,Dunfermline and West Fife,618
|
||||
S14000018,East Dunbartonshire,619
|
||||
S14000019,"East Kilbride, Strathaven and Lesmahagow",620
|
||||
S14000020,East Lothian,621
|
||||
S14000021,East Renfrewshire,622
|
||||
S14000022,Edinburgh East,623
|
||||
S14000023,Edinburgh North and Leith,624
|
||||
S14000024,Edinburgh South,625
|
||||
S14000025,Edinburgh South West,626
|
||||
S14000026,Edinburgh West,627
|
||||
S14000027,Na h-Eileanan an Iar,628
|
||||
S14000028,Falkirk,629
|
||||
S14000029,Glasgow Central,630
|
||||
S14000030,Glasgow East,631
|
||||
S14000031,Glasgow North,632
|
||||
S14000032,Glasgow North East,633
|
||||
S14000033,Glasgow North West,634
|
||||
S14000034,Glasgow South,635
|
||||
S14000035,Glasgow South West,636
|
||||
S14000036,Glenrothes,637
|
||||
S14000037,Gordon,638
|
||||
S14000038,Inverclyde,639
|
||||
S14000039,"Inverness, Nairn, Badenoch and Strathspey",640
|
||||
S14000040,Kilmarnock and Loudoun,641
|
||||
S14000041,Kirkcaldy and Cowdenbeath,642
|
||||
S14000042,Lanark and Hamilton East,643
|
||||
S14000043,Linlithgow and East Falkirk,644
|
||||
S14000044,Livingston,645
|
||||
S14000045,Midlothian,646
|
||||
S14000046,Moray,647
|
||||
S14000047,Motherwell and Wishaw,648
|
||||
S14000048,North Ayrshire and Arran,649
|
||||
S14000049,North East Fife,650
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
requests
|
||||
python-dotenv
|
||||
pandas
|
||||
tqdm
|
||||
|
|
@ -1,26 +1,61 @@
|
|||
from pathlib import Path
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from model_data.BaseUtility import Definitions
|
||||
from model_data.simulation_system.core.Settings import (
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc.settings import (
|
||||
DATA_PROCESSOR_SETTINGS,
|
||||
EARLIEST_EPC_DATE,
|
||||
FULLY_GLAZED_DESCRIPTIONS,
|
||||
AVERAGE_FIXED_FEATURES,
|
||||
FLOOR_LEVEL_MAP,
|
||||
BUILT_FORM_REMAP,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
COMPONENT_FEATURES,
|
||||
FIXED_FEATURES,
|
||||
COLUMNTYPES,
|
||||
RDSAP_RESPONSE,
|
||||
MAX_SAP_SCORE,
|
||||
fill_na_map,
|
||||
FIXED_DESCRIPTON_MAPPED_FEATURES
|
||||
STARTING_SUFFIX_COMPONENT_COLS,
|
||||
NO_SUFFIX_COMPONENT_COLS,
|
||||
ENDING_SUFFIX_COMPONENT_COLS
|
||||
)
|
||||
from recommendations.rdsap_tables import FLOOR_LEVEL_MAP
|
||||
|
||||
from typing import List
|
||||
|
||||
# These lookups are used to clean the construction age band
|
||||
# Inclusive lower ("l") and upper ("u") build-year limits for each standard
# England and Wales construction age band label.
bounds_map = {
    "England and Wales: before 1900": {"l": 0, "u": 1899},
    "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
    "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
    "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
    "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
    "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
    "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
    "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
    "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
    "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
    "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
    "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
}

# Non-standard labels and the standard band each one is folded into.
remap = {"England and Wales: 2007 onwards": "England and Wales: 2007-2011"}

# Year -> band label for every year from 0 to 3000 inclusive. The bands above
# do not overlap, so the first band whose limits contain the year is the only one.
expanded_map = {
    year: next(
        band
        for band, limits in bounds_map.items()
        if limits["l"] <= year <= limits["u"]
    )
    for year in range(3001)
}
|
||||
|
||||
|
||||
def is_int(x):
    """
    Report whether ``int(x)`` succeeds.

    Note that this follows ``int()`` exactly: floats such as 12.5 are accepted
    (they truncate), while strings such as "12.5" are not.

    :param x: any value
    :return: True if ``int(x)`` succeeds, False if it raises TypeError,
        ValueError or OverflowError
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # Only the failures int() raises for bad input are treated as "not an
        # int"; a bare except would also hide KeyboardInterrupt / SystemExit
        return False
    return True
|
||||
|
||||
|
||||
class DataProcessor:
|
||||
"""
|
||||
|
|
@ -46,66 +81,36 @@ class DataProcessor:
|
|||
def insert_data(self, data: pd.DataFrame) -> None:
|
||||
self.data = data
|
||||
|
||||
@staticmethod
def clean_construction_age_band(x):
    """
    Map a single raw construction age band value onto a standard band label.

    The checks are applied in this order:
      1. anomaly markers, None and NaN are returned unchanged
      2. values that are already a key of ``bounds_map`` are returned unchanged
      3. values listed in ``remap`` are replaced by their standard label
      4. values accepted by ``int()`` are treated as a year and looked up in
         ``expanded_map``

    :param x: a single raw age band value
    :return: the standard label, or x itself when it is missing or anomalous
    :raises NotImplementedError: if none of the checks recognises x
    :raises KeyError: if x converts to an integer that is not a key of ``expanded_map``
    """
    # Firstly, we check if it's an error value. NaN is detected by
    # self-inequality: membership in [None, np.nan] only matches the np.nan
    # object itself, not NaN floats created elsewhere
    is_missing = x is None or (isinstance(x, (float, np.floating)) and x != x)
    if x in Definitions.DATA_ANOMALY_MATCHES or is_missing:
        return x

    # Next, we check if it's a value in our map
    if bounds_map.get(x):
        return x

    # We check if it's a standard remap value
    remap_value = remap.get(x, None)
    if remap_value:
        return remap_value

    # We check if it's a number
    if is_int(x):
        x_int = int(x)
        return expanded_map[x_int]

    raise NotImplementedError("Not handled the case for value %s" % x)
|
||||
|
||||
def standardise_construction_age_band(self):
|
||||
"""
|
||||
This function will tidy up some of the non-standard values that are populated in the construction age
|
||||
band, which is useful for cleaning
|
||||
"""
|
||||
bounds_map = {
|
||||
"England and Wales: before 1900": {"l": 0, "u": 1899},
|
||||
"England and Wales: 1930-1949": {"l": 1930, "u": 1949},
|
||||
"England and Wales: 1900-1929": {"l": 1900, "u": 1929},
|
||||
"England and Wales: 1950-1966": {"l": 1950, "u": 1966},
|
||||
"England and Wales: 1967-1975": {"l": 1967, "u": 1975},
|
||||
"England and Wales: 1976-1982": {"l": 1976, "u": 1982},
|
||||
"England and Wales: 1983-1990": {"l": 1983, "u": 1990},
|
||||
"England and Wales: 1991-1995": {"l": 1991, "u": 1995},
|
||||
"England and Wales: 1996-2002": {"l": 1996, "u": 2002},
|
||||
"England and Wales: 2003-2006": {"l": 2003, "u": 2006},
|
||||
"England and Wales: 2007-2011": {"l": 2007, "u": 2011},
|
||||
"England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
|
||||
}
|
||||
|
||||
remap = {
|
||||
"England and Wales: 2007 onwards": "England and Wales: 2007-2011"
|
||||
}
|
||||
|
||||
expanded_map = {
|
||||
i: [
|
||||
label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
|
||||
][0] for i in range(0, 3001)
|
||||
}
|
||||
|
||||
def is_int(x):
|
||||
try:
|
||||
int(x)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
def clean_construction_age_band(x):
|
||||
# Firstly, we check if it's an error value
|
||||
if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]:
|
||||
return x
|
||||
|
||||
# Next, we check if it's a value in our map
|
||||
if bounds_map.get(x):
|
||||
return x
|
||||
|
||||
# We check if it's a standard remap value
|
||||
remap_value = remap.get(x, None)
|
||||
if remap_value:
|
||||
return remap_value
|
||||
|
||||
# We check if it's a number
|
||||
if is_int(x):
|
||||
x_int = int(x)
|
||||
return expanded_map[x_int]
|
||||
|
||||
raise NotImplementedError("Not handled the case for value %s" % x)
|
||||
|
||||
self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
|
||||
lambda x: clean_construction_age_band(x)
|
||||
lambda x: self.clean_construction_age_band(x)
|
||||
)
|
||||
|
||||
self.data = self.data[
|
||||
|
|
@ -157,18 +162,6 @@ class DataProcessor:
|
|||
break
|
||||
to_index -= 1
|
||||
|
||||
def reformat_columns(self):
    """
    Rename every column of ``self.data`` to upper case with underscores.

    Data requested from the EPC API has lower-case, hyphen-separated column
    names, whereas the bulk download uses capitalised, underscore-separated
    names. This converts the former into the latter, in place.
    :return:
    """
    renamed = []
    for name in self.data.columns:
        renamed.append(name.upper().replace("-", "_"))
    self.data.columns = renamed
|
||||
|
||||
def pre_process(self) -> pd.DataFrame:
|
||||
"""
|
||||
Load data and begin initial cleaning
|
||||
|
|
@ -176,22 +169,24 @@ class DataProcessor:
|
|||
if self.data is None:
|
||||
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
|
||||
|
||||
if self.newdata:
|
||||
self.reformat_columns()
|
||||
|
||||
if not self.newdata:
|
||||
self.confine_data()
|
||||
|
||||
self.remap_columns()
|
||||
|
||||
# We have some non-standard construction age bands which we'll clean for matching
|
||||
self.standardise_construction_age_band()
|
||||
self.clean_missing_rooms()
|
||||
if not self.newdata:
|
||||
self.standardise_construction_age_band()
|
||||
|
||||
self.clean_missing_rooms()
|
||||
|
||||
self.recast_df_columns(
|
||||
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
|
||||
)
|
||||
self.clean_multi_glaze_proportion()
|
||||
|
||||
if not self.newdata:
|
||||
self.clean_multi_glaze_proportion()
|
||||
|
||||
self.clean_photo_supply()
|
||||
|
||||
if not self.newdata:
|
||||
|
|
@ -203,16 +198,24 @@ class DataProcessor:
|
|||
# If we have multiple EPC records, we can try and do filling
|
||||
self.fill_na_fields()
|
||||
|
||||
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||
if not self.newdata:
|
||||
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||
|
||||
# Final re-casting after data transformed and prepared
|
||||
self.data = self.data.astype(COLUMNTYPES)
|
||||
coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.newdata else COLUMNTYPES
|
||||
self.data = self.data.astype(coltypes)
|
||||
|
||||
self.na_remapping()
|
||||
|
||||
return self.data
|
||||
|
||||
def na_remapping(self):
|
||||
for column, fill_value in fill_na_map.items():
|
||||
|
||||
fill_na_map_apply = {
|
||||
k: v for k, v in fill_na_map.items() if k in self.data.columns
|
||||
} if self.newdata else fill_na_map
|
||||
|
||||
for column, fill_value in fill_na_map_apply.items():
|
||||
self.data[column] = self.data[column].fillna(fill_value)
|
||||
|
||||
def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
|
||||
|
|
@ -255,7 +258,8 @@ class DataProcessor:
|
|||
data = data.replace(np.NAN, None)
|
||||
|
||||
# Remap certain columns
|
||||
data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
|
||||
if not self.newdata:
|
||||
data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
|
||||
data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
|
||||
|
||||
convert_to_lower = ["TRANSACTION_TYPE"]
|
||||
|
|
@ -348,7 +352,7 @@ class DataProcessor:
|
|||
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")
|
||||
|
||||
# If there still is na values, use average across all properties in consituecy
|
||||
# If there are still NA values, use the average across all EPCs in the constituency
|
||||
cleaning_averages_filled[variable] = cleaning_averages_filled[
|
||||
variable
|
||||
].fillna(cleaning_averages_filled[variable].mean())
|
||||
|
|
@ -497,9 +501,15 @@ class DataProcessor:
|
|||
"""
|
||||
|
||||
if suffix not in ["_STARTING", "_ENDING"]:
|
||||
raise Exception("Suffix should be one of _STARTING or _ENFING")
|
||||
raise Exception("Suffix should be one of _STARTING or _ENDING")
|
||||
|
||||
return self.data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix(suffix)
|
||||
if suffix == "_STARTING":
|
||||
starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix)
|
||||
fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS].copy()
|
||||
|
||||
return pd.concat([starting_cols, fixed_cols], axis=1)
|
||||
|
||||
return self.data[ENDING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix)
|
||||
|
||||
def get_fixed_features(self) -> pd.DataFrame:
|
||||
"""
|
||||
|
|
@ -529,125 +539,33 @@ class DataProcessor:
|
|||
|
||||
return df
|
||||
|
||||
@classmethod
|
||||
def difference_data(cls, df: pd.DataFrame):
|
||||
@staticmethod
|
||||
def calculate_days_to(lodgement_date):
|
||||
|
||||
"""
|
||||
Given a dataframe and starting and ending columns, this function will convert the features to
|
||||
differenced the ending subtract the starting value, which is useful for modelling the difference responces
|
||||
"""
|
||||
if isinstance(lodgement_date, str):
|
||||
return (
|
||||
pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
|
||||
).days
|
||||
|
||||
# We ensure that the u value columns are co-erced to a numerical format
|
||||
uvalue_columns = [col for col in df.columns if "thermal_transmittance" in col]
|
||||
for uvalue_col in uvalue_columns:
|
||||
df[uvalue_col] = pd.to_numeric(df[uvalue_col])
|
||||
return (
|
||||
pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
|
||||
).dt.days
|
||||
|
||||
key_columns = [
|
||||
"RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE",
|
||||
"SAP_STARTING", "HEAT_DEMAND_STARTING",
|
||||
"CARBON_STARTING", "UPRN", "CONSTITUENCY",
|
||||
"SAP_ENDING", "CARBON_ENDING", "HEAT_DEMAND_ENDING",
|
||||
"DAYS_TO_STARTING", "DAYS_TO_ENDING"
|
||||
]
|
||||
@staticmethod
|
||||
def clean_missings_after_description_process(df, ignore_cols=None):
|
||||
missings = pd.isnull(df).sum()
|
||||
missings = missings[missings > 0]
|
||||
|
||||
ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns
|
||||
if ignore_cols:
|
||||
missings = missings[~missings.index.isin(ignore_cols)]
|
||||
|
||||
columns = {x for x in df.columns if x not in ignore_cols}
|
||||
|
||||
non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
|
||||
non_numerical_columns = [col for col in non_numerical_columns if col in columns]
|
||||
levels = {col: df[col].unique().tolist() for col in non_numerical_columns}
|
||||
|
||||
df = pd.get_dummies(df, columns=non_numerical_columns)
|
||||
|
||||
# We make sure there is a starting and ending version of the column
|
||||
diff_columns = []
|
||||
no_diff_columns = [] # Store for debugging
|
||||
for col in columns:
|
||||
if "_ENDING" in col:
|
||||
# Don't keep the endings
|
||||
continue
|
||||
for col in missings.index:
|
||||
unique_values = df[col].unique()
|
||||
if True in unique_values or False in unique_values:
|
||||
df[col] = df[col].fillna(False)
|
||||
if "none" in unique_values:
|
||||
df[col] = df[col].fillna("none")
|
||||
else:
|
||||
# We have a starting column so check if we have an ending
|
||||
if col.replace("_STARTING", "") + "_ENDING" in columns:
|
||||
diff_columns.append(col)
|
||||
else:
|
||||
no_diff_columns.append(col)
|
||||
|
||||
if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
|
||||
raise Exception("Something went wrong, potentially missed a differencing column")
|
||||
|
||||
datatypes = df.dtypes
|
||||
|
||||
# Note: We also difference columns like floor area and floor height. We should experiement with this.
|
||||
# Starting floor area will heavily impact the starting sap value so that feature may be encapsulated by
|
||||
# the starting value, therefore to explain any differences in the new floor area, it may be enough to
|
||||
# just consider the difference however we can play around with this.
|
||||
|
||||
# Do the differencing
|
||||
cols_to_append = {}
|
||||
for starting_col in diff_columns:
|
||||
|
||||
base_col = starting_col.replace("_STARTING", "")
|
||||
|
||||
if "_STARTING" in starting_col:
|
||||
ending_col = starting_col.replace("_STARTING", "_ENDING")
|
||||
else:
|
||||
ending_col = starting_col + "_ENDING"
|
||||
|
||||
if starting_col not in non_numerical_columns:
|
||||
cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
|
||||
df = df.drop(columns=[starting_col, ending_col])
|
||||
continue
|
||||
|
||||
level_values = list(set(levels[starting_col] + levels[ending_col]))
|
||||
|
||||
level_cols = []
|
||||
for level in level_values:
|
||||
starting_level_col = "_".join([starting_col, str(level)])
|
||||
ending_level_col = "_".join([ending_col, str(level)])
|
||||
|
||||
if starting_level_col not in df.columns:
|
||||
# We have no starting, just ending
|
||||
col_type = datatypes[ending_level_col].name
|
||||
|
||||
if col_type == "bool":
|
||||
cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col].astype(int)
|
||||
else:
|
||||
cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col]
|
||||
|
||||
level_cols.append(ending_level_col)
|
||||
|
||||
elif ending_level_col not in df.columns:
|
||||
# We have no ending, just starting
|
||||
col_type = datatypes[starting_level_col].name
|
||||
|
||||
if col_type == "bool":
|
||||
cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[starting_level_col].astype(int)
|
||||
else:
|
||||
cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[ending_level_col]
|
||||
|
||||
level_cols.append(starting_level_col)
|
||||
|
||||
else:
|
||||
col_type = datatypes[starting_level_col].name
|
||||
|
||||
if col_type == "bool":
|
||||
cols_to_append[f"{base_col}_{level}_DIFF"] = (
|
||||
df[ending_level_col].astype(int) - df[starting_level_col].astype(int)
|
||||
)
|
||||
else:
|
||||
cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col] - df[starting_level_col]
|
||||
|
||||
level_cols.extend([starting_level_col, ending_level_col])
|
||||
|
||||
# Drop the columns
|
||||
df = df.drop(columns=level_cols)
|
||||
|
||||
cols_to_append = pd.DataFrame(cols_to_append)
|
||||
df = pd.concat([df, cols_to_append], axis=1)
|
||||
|
||||
# Perform a final coercing of string True/False columns to boolean
|
||||
df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns)
|
||||
df[col] = df[col].fillna("Unknown")
|
||||
|
||||
return df
|
||||
|
|
@ -4,25 +4,24 @@ from tqdm import tqdm
|
|||
import msgpack
|
||||
|
||||
from pathlib import Path
|
||||
from model_data.simulation_system.core.Settings import (
|
||||
from etl.epc.settings import (
|
||||
MANDATORY_FIXED_FEATURES,
|
||||
LATEST_FIELD,
|
||||
COMPONENT_FEATURES,
|
||||
RDSAP_RESPONSE,
|
||||
HEAT_DEMAND_RESPONSE,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
EARLIEST_EPC_DATE,
|
||||
CARBON_RESPONSE,
|
||||
)
|
||||
from model_data.simulation_system.core.DataProcessor import DataProcessor
|
||||
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_from_s3_parquet
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
|
||||
from recommendations.rdsap_tables import england_wales_age_band_lookup
|
||||
from recommendations.recommendation_utils import (
|
||||
get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter,
|
||||
get_wall_type
|
||||
)
|
||||
|
||||
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
||||
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
|
||||
|
||||
|
||||
def get_cleaned():
|
||||
|
|
@ -364,21 +363,6 @@ def make_uvalues(df):
|
|||
return df
|
||||
|
||||
|
||||
def clean_missings_after_description_process(df):
|
||||
missings = pd.isnull(df).sum()
|
||||
missings = missings[missings > 0]
|
||||
for col in missings.index:
|
||||
unique_values = df[col].unique()
|
||||
if True in unique_values or False in unique_values:
|
||||
df[col] = df[col].fillna(False)
|
||||
if "none" in unique_values:
|
||||
df[col] = df[col].fillna("none")
|
||||
else:
|
||||
df[col] = df[col].fillna("Unknown")
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def app():
|
||||
# Get all the files in the directory
|
||||
|
||||
|
|
@ -400,6 +384,8 @@ def app():
|
|||
data_processor = DataProcessor(filepath=filepath)
|
||||
|
||||
df = data_processor.pre_process()
|
||||
df[df["WALLS_DESCRIPTION"].str.contains("Cavity")]["WALLS_DESCRIPTION"].unique()
|
||||
|
||||
cleaning_averages = data_processor.make_cleaning_averages()
|
||||
|
||||
# We have some odd cases with missing constituency so we fill
|
||||
|
|
@ -512,12 +498,11 @@ def app():
|
|||
|
||||
# Add some temporal features - we look at the days from the standard starting point in time
|
||||
# for the starting and ending date so all records are from a fixed point
|
||||
data_by_urpn_df["DAYS_TO_STARTING"] = (
|
||||
pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_STARTING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
|
||||
).dt.days
|
||||
data_by_urpn_df["DAYS_TO_ENDING"] = (
|
||||
pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
|
||||
).dt.days
|
||||
data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
|
||||
data_by_urpn_df["LODGEMENT_DATE_STARTING"])
|
||||
|
||||
data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
|
||||
data_by_urpn_df["LODGEMENT_DATE_ENDING"])
|
||||
|
||||
data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
|
||||
|
||||
|
|
@ -544,7 +529,7 @@ def app():
|
|||
# Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
|
||||
# need to
|
||||
|
||||
data_by_urpn_df = clean_missings_after_description_process(data_by_urpn_df)
|
||||
data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df)
|
||||
|
||||
if pd.isnull(data_by_urpn_df).sum().sum():
|
||||
raise ValueError("Null values found in dataset after process_and_prune_desriptions")
|
||||
|
|
@ -564,6 +549,12 @@ def app():
|
|||
|
||||
output = pd.concat(dataset)
|
||||
|
||||
# Remove any records that have huge swings in their floor area
|
||||
output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
|
||||
output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]
|
||||
output = output[output["tfa_diff_prop"] < 0.5]
|
||||
output = output.drop(columns=["tfa_diff_abs", "tfa_diff_prop"])
|
||||
|
||||
uvalue_columns = [col for col in output.columns if "thermal_transmittance" in col]
|
||||
for uvalue_col in uvalue_columns:
|
||||
output[uvalue_col] = pd.to_numeric(output[uvalue_col])
|
||||
|
|
@ -571,15 +562,7 @@ def app():
|
|||
save_dataframe_to_s3_parquet(
|
||||
df=output,
|
||||
bucket_name="retrofit-data-dev",
|
||||
file_key="sap_change_model/dataset_without_differencing.parquet",
|
||||
)
|
||||
|
||||
output = DataProcessor.difference_data(output)
|
||||
|
||||
save_dataframe_to_s3_parquet(
|
||||
df=output,
|
||||
bucket_name="retrofit-data-dev",
|
||||
file_key="sap_change_model/dataset_with_differencing.parquet",
|
||||
file_key="sap_change_model/dataset.parquet",
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -133,28 +133,6 @@ RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
|
|||
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
|
||||
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
|
||||
|
||||
|
||||
def ordinal(n):
    """
    Return n followed by its English ordinal suffix, e.g. 1 -> "1st", 12 -> "12th".

    Numbers whose remainder modulo 100 lies between 10 and 20 always take "th";
    otherwise the suffix is chosen from the remainder modulo 10.
    """
    remainder = n % 100
    if remainder >= 10 and remainder <= 20:
        ending = "th"
    else:
        units = n % 10
        if units == 1:
            ending = "st"
        elif units == 2:
            ending = "nd"
        elif units == 3:
            ending = "rd"
        else:
            ending = "th"

    return str(n) + ending
|
||||
|
||||
|
||||
FLOOR_LEVEL_MAP = {
|
||||
"Basement": -1,
|
||||
"Ground": 0,
|
||||
"ground floor": 0,
|
||||
"20+": 20,
|
||||
"21st or above": 21,
|
||||
**{str(i).zfill(2): i for i in range(0, 21)},
|
||||
**{ordinal(i): i for i in range(-1, 21)},
|
||||
**{str(i): i for i in range(-1, 21)},
|
||||
**{i: i for i in range(-1, 21)},
|
||||
}
|
||||
|
||||
BUILT_FORM_REMAP = {
|
||||
"Enclosed End-Terrace": "End-Terrace",
|
||||
"Enclosed Mid-Terrace": "Mid-Terrace",
|
||||
|
|
@ -212,10 +190,66 @@ fill_na_map = {
|
|||
"NUMBER_OPEN_FIREPLACES": 0
|
||||
}
|
||||
|
||||
# After the property descriptions have been re-remapped, we expect these features to be fixed
|
||||
FIXED_DESCRIPTON_MAPPED_FEATURES = [
|
||||
'another_property_below', 'is_roof_room', 'is_granite_or_whinstone', 'is_flat', 'is_suspended',
|
||||
'has_dwelling_above', 'is_as_built', 'is_to_external_air', 'is_cob', 'is_pitched', 'is_solid', 'is_at_rafters',
|
||||
'is_solid_brick', 'is_loft', 'is_system_built', 'is_timber_frame', 'is_sandstone_or_limestone', 'is_filled_cavity',
|
||||
'is_cavity_wall', 'is_thatched', 'is_to_unheated_space'
|
||||
################################################################################################
|
||||
# These are the features we need for scoring
|
||||
# We'll likely change how we do this in the future
|
||||
################################################################################################
|
||||
|
||||
STARTING_SUFFIX_COMPONENT_COLS = [
|
||||
"SAP", "HEAT_DEMAND", "CARBON", "TRANSACTION_TYPE", "MECHANICAL_VENTILATION",
|
||||
"SECONDHEAT_DESCRIPTION", "ENERGY_TARIFF", "SOLAR_WATER_HEATING_FLAG", "PHOTO_SUPPLY",
|
||||
"GLAZED_TYPE", "MULTI_GLAZE_PROPORTION", "LOW_ENERGY_LIGHTING", "NUMBER_OPEN_FIREPLACES",
|
||||
"EXTENSION_COUNT", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "DAYS_TO", "estimated_perimeter"
|
||||
]
|
||||
NO_SUFFIX_COMPONENT_COLS = ['walls_thermal_transmittance', 'is_cavity_wall',
|
||||
'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
|
||||
'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone',
|
||||
'is_park_home', 'walls_insulation_thickness', 'external_insulation', 'internal_insulation',
|
||||
'floor_thermal_transmittance', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended',
|
||||
'is_solid', 'another_property_below', 'floor_insulation_thickness',
|
||||
'roof_thermal_transmittance', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat',
|
||||
'is_thatched', 'is_at_rafters', 'has_dwelling_above', 'roof_insulation_thickness',
|
||||
'heater_type', 'system_type', 'thermostat_characteristics', 'heating_scope',
|
||||
'energy_recovery',
|
||||
'hotwater_tariff_type', 'extra_features', 'chp_systems', 'distribution_system',
|
||||
'no_system_present', 'appliance', 'has_radiators', 'has_fan_coil_units',
|
||||
'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
|
||||
'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
|
||||
'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
|
||||
'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
|
||||
'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
|
||||
'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
|
||||
'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas',
|
||||
'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
|
||||
'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k',
|
||||
'has_electricaire', 'has_assumed_for_most_rooms', 'has_underfloor_heating',
|
||||
'thermostatic_control', 'charging_system', 'switch_system', 'no_control', 'dhw_control',
|
||||
'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
|
||||
'rate_control',
|
||||
'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
|
||||
'no_individual_heating_or_community_network', 'complex_fuel_type',
|
||||
]
|
||||
|
||||
ENDING_SUFFIX_COMPONENT_COLS = [
|
||||
'SAP', 'HEAT_DEMAND', 'CARBON', 'TRANSACTION_TYPE', 'MECHANICAL_VENTILATION', 'SECONDHEAT_DESCRIPTION',
|
||||
'ENERGY_TARIFF', 'SOLAR_WATER_HEATING_FLAG', 'PHOTO_SUPPLY', 'GLAZED_TYPE', 'MULTI_GLAZE_PROPORTION',
|
||||
'LOW_ENERGY_LIGHTING', 'NUMBER_OPEN_FIREPLACES', 'EXTENSION_COUNT', 'TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT',
|
||||
'DAYS_TO', 'walls_thermal_transmittance', 'is_park_home', 'walls_insulation_thickness',
|
||||
'external_insulation', 'internal_insulation', 'floor_thermal_transmittance', 'floor_insulation_thickness',
|
||||
'roof_thermal_transmittance', 'roof_insulation_thickness', 'heater_type', 'system_type',
|
||||
'thermostat_characteristics', 'heating_scope', 'energy_recovery', 'hotwater_tariff_type', 'extra_features',
|
||||
'chp_systems', 'distribution_system', 'no_system_present', 'appliance', 'has_radiators',
|
||||
'has_fan_coil_units', 'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
|
||||
'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
|
||||
'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
|
||||
'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
|
||||
'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
|
||||
'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
|
||||
'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas', 'has_wood_logs',
|
||||
'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', 'has_dual_fuel_mineral_and_wood',
|
||||
'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire', 'has_assumed_for_most_rooms',
|
||||
'has_underfloor_heating', 'thermostatic_control', 'charging_system', 'switch_system', 'no_control',
|
||||
'dhw_control', 'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
|
||||
'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
|
||||
'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter'
|
||||
]
|
||||
|
|
@ -4,16 +4,16 @@ from collections import defaultdict
|
|||
|
||||
import pandas as pd
|
||||
|
||||
from model_data.utils import correct_spelling
|
||||
from model_data.epc_attributes.FloorAttributes import FloorAttributes
|
||||
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||
from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||
from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||
from model_data.epc_attributes.RoofAttributes import RoofAttributes
|
||||
from model_data.epc_attributes.WallAttributes import WallAttributes
|
||||
from model_data.epc_attributes.WindowAttributes import WindowAttributes
|
||||
from model_data.epc_attributes.LightingAttributes import LightingAttributes
|
||||
from etl.epc_clean.utils import correct_spelling
|
||||
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
||||
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
||||
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
|
||||
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
|
||||
|
||||
|
||||
class EpcClean:
|
||||
|
|
@ -130,7 +130,7 @@ class EpcClean:
|
|||
self.cleaned[field].append(
|
||||
{
|
||||
"original_description": description,
|
||||
"clean_description": cln.description.capitalize(),
|
||||
"clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
|
||||
**cln.process()
|
||||
}
|
||||
)
|
||||
|
|
@ -3,8 +3,8 @@ import os
|
|||
import pandas as pd
|
||||
import msgpack
|
||||
|
||||
from model_data.EpcClean import EpcClean
|
||||
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
|
||||
from etl.epc_clean.EpcClean import EpcClean
|
||||
from etl.epc.settings import EARLIEST_EPC_DATE
|
||||
from pathlib import Path
|
||||
from utils.s3 import save_data_to_s3
|
||||
|
||||
|
|
@ -19,7 +19,7 @@ LAND_REGISTRY_PATHS = [
|
|||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
|
||||
]
|
||||
|
||||
EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
||||
EPC_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
|
||||
|
||||
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
|
||||
|
||||
|
|
@ -27,7 +27,7 @@ ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
|
|||
def app():
|
||||
"""
|
||||
For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
|
||||
and produce a dataset of cleaned fields so that when we get new properties, we can quickly
|
||||
and produce a dataset of cleaned fields so that when we get new epc, we can quickly
|
||||
sanitise any description data
|
||||
|
||||
Currently, this application is just run on a local machine
|
||||
|
|
@ -36,9 +36,6 @@ def app():
|
|||
cleaned_data = {}
|
||||
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
for directory in tqdm(epc_directories):
|
||||
directory_destructured = str(directory).split("/")[-1].split("-")
|
||||
gss_code = directory_destructured[1]
|
||||
local_authority = directory_destructured[2]
|
||||
|
||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||
# Rename the columns to the same format as the api returns
|
||||
|
|
@ -62,14 +59,6 @@ def app():
|
|||
new_data = [x for x in data if x["original_description"] not in existing_descriptions]
|
||||
cleaned_data[k].extend(new_data)
|
||||
|
||||
# TODO: Add property age band into this
|
||||
# uvalue_estimates = UvalueEstimations(data=data)
|
||||
# uvalue_estimates.get_estimates(cleaner=cleaner)
|
||||
# # TODO: Store these to a s3
|
||||
# uvalue_estimates.walls
|
||||
# uvalue_estimates.floors
|
||||
# uvalue_estimates.roofs
|
||||
|
||||
# Basic check to make sure all descriptions are unique
|
||||
for _, cleaned in cleaned_data.items():
|
||||
descriptions = [x["original_description"] for x in cleaned]
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
import re
|
||||
from typing import Dict, Union
|
||||
from model_data.BaseUtility import Definitions
|
||||
from model_data.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types
|
||||
|
||||
|
||||
class FloorAttributes(Definitions):
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
from typing import Dict, Union
|
||||
from model_data.BaseUtility import Definitions
|
||||
from model_data.epc_attributes.attribute_utils import clean_description, find_keyword
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, find_keyword
|
||||
|
||||
|
||||
class HotWaterAttributes(Definitions):
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import re
|
||||
from model_data.epc_attributes.attribute_utils import clean_description
|
||||
from model_data.utils import correct_spelling
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import clean_description
|
||||
from etl.epc_clean.utils import correct_spelling
|
||||
|
||||
|
||||
class LightingAttributes:
|
||||
|
|
@ -27,7 +27,7 @@ class LightingAttributes:
|
|||
lel_match2 = re.search(r"goleuadau ynni-isel mewn (\d+)%? o'r mannau gosod", self.description)
|
||||
|
||||
if lel_match is not None or lel_match2 is not None:
|
||||
|
||||
|
||||
# Perform the actual translation
|
||||
percentage = lel_match.group(1) if lel_match is not None else lel_match2.group(1)
|
||||
self.description = f"low energy lighting in {percentage}% of fixed outlets"
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
from typing import Dict, Union
|
||||
from model_data.BaseUtility import Definitions
|
||||
from model_data.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
|
||||
|
||||
|
||||
class MainFuelAttributes(Definitions):
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
from model_data.BaseUtility import Definitions
|
||||
from model_data.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
|
||||
from typing import Dict, Union
|
||||
|
||||
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
from typing import Dict, Union
|
||||
from model_data.BaseUtility import Definitions
|
||||
from model_data.epc_attributes.attribute_utils import clean_description, find_keyword
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, find_keyword
|
||||
|
||||
|
||||
class MainheatControlAttributes(Definitions):
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
import re
|
||||
from typing import Dict, Union
|
||||
from model_data.BaseUtility import Definitions
|
||||
from model_data.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
|
||||
|
||||
|
||||
class RoofAttributes(Definitions):
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
import re
|
||||
from typing import Dict, Union
|
||||
from model_data.BaseUtility import Definitions
|
||||
from model_data.epc_attributes.attribute_utils import (
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import (
|
||||
extract_component_types,
|
||||
extract_thermal_transmittance
|
||||
)
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
from typing import Dict, Union
|
||||
from model_data.BaseUtility import Definitions
|
||||
from model_data.epc_attributes.attribute_utils import clean_description
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import clean_description
|
||||
|
||||
|
||||
class WindowAttributes(Definitions):
|
||||
21
etl/epc_clean/epc_attributes/all_cleaners.py
Normal file
21
etl/epc_clean/epc_attributes/all_cleaners.py
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
||||
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
||||
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
|
||||
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
|
||||
|
||||
# Lookup of EPC field name -> attribute cleaner class imported above.
# NOTE: the lighting key previously carried a stray trailing colon
# ('lighting-description:'), which made it the only key that did not follow
# the naming pattern of the others; it is normalised here.
all_cleaner_map = {
    'floor-description': FloorAttributes,
    'hotwater-description': HotWaterAttributes,
    'main-fuel': MainFuelAttributes,
    'mainheat-description': MainHeatAttributes,
    'mainheatcont-description': MainheatControlAttributes,
    'roof-description': RoofAttributes,
    'walls-description': WallAttributes,
    'windows-description': WindowAttributes,
    'lighting-description': LightingAttributes,
}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
import pytest
|
||||
import model_data.epc_attributes.attribute_utils as attribute_utils
|
||||
import etl.epc_clean.epc_attributes.attribute_utils as attribute_utils
|
||||
|
||||
|
||||
def test_extract_thermal_transmittance():
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
import pickle
|
||||
from model_data.EpcClean import EpcClean
|
||||
from etl.epc_clean.EpcClean import EpcClean
|
||||
from pathlib import Path
|
||||
|
||||
# For local testing
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from model_data.tests.test_data.test_floor_attributes_cases import clean_floor_cases
|
||||
from model_data.epc_attributes.FloorAttributes import FloorAttributes
|
||||
from etl.epc_clean.tests.test_data.test_floor_attributes_cases import clean_floor_cases
|
||||
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
||||
|
||||
|
||||
class TestCleanFloor:
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
from model_data.tests.test_data.test_hot_water_attributes_cases import hotwater_cases
|
||||
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
from etl.epc_clean.tests.test_data.test_hot_water_attributes_cases import hotwater_cases
|
||||
|
||||
|
||||
class TestHotWaterAttributes:
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
import pandas as pd
|
||||
import pytest
|
||||
from model_data.tests.test_data.test_lighting_attributes_cases import test_cases
|
||||
from model_data.epc_attributes.LightingAttributes import LightingAttributes
|
||||
from etl.epc_clean.tests.test_data.test_lighting_attributes_cases import test_cases
|
||||
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
|
||||
|
||||
# An example averages dataset to use in tests. It is a dictionary where the key is a lighting description and the
|
||||
# value is the expected proportion.
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||
from model_data.tests.test_data.test_main_fuel_attributes_cases import mainfuel_cases
|
||||
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||
from etl.epc_clean.tests.test_data.test_main_fuel_attributes_cases import mainfuel_cases
|
||||
|
||||
|
||||
class TestMainHeatControlAttributes:
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||
from model_data.tests.test_data.test_mainheat_attributes_cases import mainheat_cases
|
||||
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||
from etl.epc_clean.tests.test_data.test_mainheat_attributes_cases import mainheat_cases
|
||||
|
||||
|
||||
class TestMainHeatAttributes:
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||
from model_data.tests.test_data.test_mainheat_control_attributes_cases import mainheat_control_cases
|
||||
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||
from etl.epc_clean.tests.test_data.test_mainheat_control_attributes_cases import mainheat_control_cases
|
||||
|
||||
|
||||
class TestMainHeatControlAttributes:
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
import pytest
|
||||
from pathlib import Path
|
||||
from model_data.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases
|
||||
from model_data.epc_attributes.RoofAttributes import RoofAttributes
|
||||
from etl.epc_clean.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases
|
||||
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||
|
||||
# For local testing
|
||||
if __file__ == "<input>":
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
from model_data.utils import is_percentage_or_number, correct_spelling
|
||||
from etl.epc_clean.utils import is_percentage_or_number, correct_spelling
|
||||
|
||||
|
||||
class TestUtils:
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from model_data.epc_attributes.WallAttributes import WallAttributes
|
||||
from model_data.tests.test_data.test_wall_attributes_cases import wall_cases
|
||||
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
||||
from etl.epc_clean.tests.test_data.test_wall_attributes_cases import wall_cases
|
||||
|
||||
|
||||
class TestWallAttributes:
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import pytest
|
||||
from model_data.epc_attributes.WindowAttributes import WindowAttributes
|
||||
from model_data.tests.test_data.test_window_attributes_cases import windows_cases
|
||||
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
|
||||
from etl.epc_clean.tests.test_data.test_window_attributes_cases import windows_cases
|
||||
|
||||
|
||||
class TestWindowAttributes:
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import pandas as pd
|
||||
from unittest.mock import patch, call
|
||||
from model_data.LandRegistryClient import LandRegistryClient
|
||||
from etl.land_registry.LandRegistryClient import LandRegistryClient
|
||||
|
||||
|
||||
class TestLandRegistryClient:
|
||||
54
etl/property_dimensions/app.py
Normal file
54
etl/property_dimensions/app.py
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
"""
|
||||
This is a simple application which estimates some of the basic dimensions of a property based on EPC
|
||||
data which we can use as a proxy value if we don't have this information on the EPC
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from etl.epc.settings import EARLIEST_EPC_DATE
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from BaseUtility import Definitions
|
||||
from utils.s3 import save_dataframe_to_s3_parquet
|
||||
|
||||
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
|
||||
|
||||
GROUPBY = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY", "CONSTRUCTION_AGE_BAND"]
|
||||
|
||||
BUCKET = os.environ.get("BUCKET", "retrofit-data-dev")
|
||||
|
||||
|
||||
def app():
    """
    Aggregate typical property dimensions from the local EPC certificate extracts.

    For every sub-directory of DATA_DIRECTORY the function reads its
    ``certificates.csv``, keeps rows lodged on or after EARLIEST_EPC_DATE that
    have a UPRN, cleans the construction age band, drops rows with missing or
    anomalous values, and then aggregates by GROUPBY (median habitable rooms,
    mean floor area, mean floor height). The aggregate is written to BUCKET as
    ``property_dimensions/<local authority>.parquet``.

    :raises Exception: if a single certificates file contains more than one local authority
    """
    certificate_dirs = [path for path in DATA_DIRECTORY.iterdir() if path.is_dir()]

    # Columns that must be populated for a row to contribute to the aggregates
    required_columns = [
        "CONSTRUCTION_AGE_BAND",
        "TOTAL_FLOOR_AREA",
        "NUMBER_HABITABLE_ROOMS",
        "FLOOR_HEIGHT",
    ]
    aggregations = {
        "NUMBER_HABITABLE_ROOMS": "median",
        "TOTAL_FLOOR_AREA": "mean",
        "FLOOR_HEIGHT": "mean",
    }

    for certificate_dir in tqdm(certificate_dirs):
        data = pd.read_csv(certificate_dir / "certificates.csv", low_memory=False)

        # Row-level filters are applied before the casts so that discarded rows
        # can never trigger a conversion error
        data = data[data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
        data = data[data["UPRN"].notna()]

        data["TOTAL_FLOOR_AREA"] = data["TOTAL_FLOOR_AREA"].astype(float)
        data["CONSTRUCTION_AGE_BAND"] = data["CONSTRUCTION_AGE_BAND"].apply(
            DataProcessor.clean_construction_age_band
        )

        data = data.dropna(subset=required_columns)
        data = data[~data["CONSTRUCTION_AGE_BAND"].isin(Definitions.DATA_ANOMALY_MATCHES)]

        df = data.groupby(GROUPBY).agg(aggregations).reset_index()

        # Each certificates file is expected to describe exactly one local authority
        authorities = data["LOCAL_AUTHORITY"].unique()
        if len(authorities) > 1:
            raise Exception("More than one la in data")
        local_authority = authorities[0]

        save_dataframe_to_s3_parquet(
            df=df,
            bucket_name=BUCKET,
            file_key=f"property_dimensions/{local_authority}.parquet",
        )
|
||||
|
|
@ -56,7 +56,7 @@ class BoreholeClient:
|
|||
|
||||
# EXAMPLE
|
||||
# There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of
|
||||
# entries in here if possible before we produce any form of comparison between our properties, to infer
|
||||
# entries in here if possible before we produce any form of comparison between our epc, to infer
|
||||
# the distance from the property to the nearest borehole
|
||||
|
||||
# Let's take a sample
|
||||
|
|
@ -1,12 +1,55 @@
|
|||
from enum import Enum
|
||||
import boto3
|
||||
import os
|
||||
import tempfile
|
||||
import geopandas as gpd
|
||||
import numpy as np
|
||||
from enum import Enum
|
||||
from shapely.geometry import Point
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_io_from_s3
|
||||
from datatypes.datatypes import OpenUprnCoordinateData
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def read_shapefile_from_s3(bucket_name, s3_file_key):
    """
    Read a shapefile from S3 into a GeoDataFrame.

    A shapefile is made up of several sibling files, so every object that lives
    in the same S3 "folder" as ``s3_file_key`` is downloaded into a temporary
    directory before the ``.shp`` file is opened.

    :param bucket_name: The name of the S3 bucket
    :param s3_file_key: The file path of the shape file
    :return: GeoDataFrame containing the shapefile data
    :raises FileNotFoundError: if the shapefile was not found under the given key
    """
    s3_folder_key, _, shape_file_key = s3_file_key.rpartition("/")

    # A trailing slash stops the prefix from also matching sibling folders or
    # files whose names merely start with the folder name
    prefix = f"{s3_folder_key}/" if s3_folder_key else ""

    s3_client = boto3.client('s3')

    with tempfile.TemporaryDirectory() as tmpdirname:
        logger.info("Creating temporary directory at %s", tmpdirname)

        # Paginate: a single list call returns at most 1000 keys, and a page
        # without matches has no 'Contents' entry at all
        paginator = s3_client.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            for s3_object in page.get("Contents", []):
                file_key = s3_object["Key"]
                file_name = os.path.basename(file_key)
                if not file_name:
                    # "Folder" placeholder objects end in '/' and have no file name
                    continue

                local_file_path = os.path.join(tmpdirname, file_name)
                with open(local_file_path, "wb") as tmpfile:
                    s3_client.download_fileobj(bucket_name, file_key, tmpfile)

        shapefile_path = os.path.join(tmpdirname, shape_file_key)
        if not os.path.exists(shapefile_path):
            raise FileNotFoundError(
                f"Shapefile s3://{bucket_name}/{s3_file_key} was not found"
            )

        # Read while the temporary directory still exists
        gdf = gpd.read_file(shapefile_path)

    return gdf
|
||||
|
||||
|
||||
class ConservationAreaClient:
|
||||
"""
|
||||
Class to interact with and manipulate conservation area data. The historic england data
|
||||
|
|
@ -18,13 +61,14 @@ class ConservationAreaClient:
|
|||
"""
|
||||
|
||||
SOURCES = ["historic_england"]
|
||||
IN_CONSERVATION_AREA = "in_conservation_area"
|
||||
NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
|
||||
UNKNOWN = "unknown"
|
||||
IN_CONSERVATION_AREA = True
|
||||
NOT_IN_CONSERVATION_AREA = False
|
||||
UNKNOWN = None
|
||||
|
||||
def __init__(self, historic_england_path, gov_path):
|
||||
def __init__(self, historic_england_path, gov_path, bucket):
|
||||
self.historic_england_path = historic_england_path
|
||||
self.gov_path = gov_path
|
||||
self.bucket = bucket
|
||||
|
||||
self.historic_england_data = None
|
||||
self.gov_data = None
|
||||
|
|
@ -34,11 +78,21 @@ class ConservationAreaClient:
|
|||
Read the data
|
||||
"""
|
||||
logger.info("Reading in historic england conservation area shapefile")
|
||||
self.historic_england_data = gpd.read_file(self.historic_england_path)
|
||||
self.historic_england_data = read_shapefile_from_s3(
|
||||
bucket_name=self.bucket, s3_file_key=self.historic_england_path
|
||||
)
|
||||
|
||||
logger.info("Reading in Govenment conservation area geojson")
|
||||
self.gov_data = gpd.read_file(self.gov_path)
|
||||
|
||||
self.gov_data = gpd.read_file(
|
||||
read_io_from_s3(
|
||||
bucket_name=self.bucket,
|
||||
file_key=self.gov_path
|
||||
)
|
||||
)
|
||||
self.gov_data = self.gov_data.drop(columns=["dataset"])
|
||||
# Convert the gov data to british national grid co-ordinates
|
||||
self.gov_data = self.gov_data.to_crs("EPSG:27700")
|
||||
|
||||
def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
|
||||
|
||||
|
|
@ -71,6 +125,43 @@ class ConservationAreaClient:
|
|||
else:
|
||||
return ConservationAreaClient.UNKNOWN
|
||||
|
||||
def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Add a ``conservation_status`` column to ``uprn_gdf`` using spatial joins.

    The Historic England polygons are checked first; every UPRN that was not
    positively matched there is then checked against the government polygons.
    The input GeoDataFrame is modified in place and also returned.

    :param uprn_gdf: GeoDataFrame with a "UPRN" column and point geometries
    :return: the same GeoDataFrame with the ``conservation_status`` column set to
        IN_CONSERVATION_AREA, NOT_IN_CONSERVATION_AREA or UNKNOWN
    """
    he_join = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within")

    # A missing right-hand index means the point matched no Historic England polygon
    he_matched = he_join.index_right.notna()
    he_has_data = he_join["NAME"] != "No data available for publication by HE"

    # Matched a polygon that carries real data -> definitely in a conservation area
    uprn_in_conservation_he = he_join.loc[he_matched & he_has_data, "UPRN"].unique()

    # Matched nothing at all -> not in a conservation area according to Historic England
    outside_he = ~he_join["UPRN"].isin(uprn_in_conservation_he) & ~he_matched
    uprn_not_in_conservation_he = he_join.loc[outside_he, "UPRN"].unique()

    # Everything not positively matched above is re-checked against the government data
    remaining_gdf = uprn_gdf[~uprn_gdf["UPRN"].isin(uprn_in_conservation_he)]
    gov_join = gpd.sjoin(remaining_gdf, self.gov_data, how="left", predicate="within")
    uprn_in_conservation_gov = gov_join.loc[gov_join.index_right.notna(), "UPRN"].unique()

    # Later assignments win, so a government match overrides a Historic England "not in"
    uprn_gdf['conservation_status'] = self.UNKNOWN
    status_updates = (
        (uprn_in_conservation_he, self.IN_CONSERVATION_AREA),
        (uprn_not_in_conservation_he, self.NOT_IN_CONSERVATION_AREA),
        (uprn_in_conservation_gov, self.IN_CONSERVATION_AREA),
    )
    for uprns, status in status_updates:
        uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprns), 'conservation_status'] = status

    return uprn_gdf
|
||||
|
||||
def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> str:
|
||||
"""
|
||||
Check if a property is in a conservation area
|
||||
118
etl/spatial/OpenUprnClient.py
Normal file
118
etl/spatial/OpenUprnClient.py
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import geopandas as gpd
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class OpenUprnClient:
    """
    This client reads in the Open UPRN data from s3 which can be downloaded from here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    This dataset contains a lookup of UPRNs to coordinates.

    Specs for this dataset can be found here:
    https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
    """

    def __init__(self, path, bucket, uprns=None):
        """
        :param path: S3 key (or local path when using ``read_local``) of the Open UPRN csv
        :param bucket: Name of the S3 bucket that holds the csv
        :param uprns: Optional iterable of UPRNs to restrict the data to
        """
        self.path = path
        self.bucket = bucket
        self.uprns = [int(x) for x in uprns] if uprns else None
        self.data = None

        # This will be stored in S3 and will be the complete list of filenames
        # We'll then use this to determine which file the UPRN's data is contained in
        self.filenames = None

    def _store(self, df):
        """Keep only the requested UPRNs (when a filter was given) and store the result."""
        if self.uprns:
            df = df[df["UPRN"].isin(self.uprns)]

        self.data = df

    def read(self):
        """
        Read the Open UPRN csv from S3 into ``self.data``.

        :return: None
        """
        logger.info("Reading in open uprn data")

        df = pd.read_csv(
            read_io_from_s3(
                bucket_name=self.bucket,
                file_key=self.path
            )
        )
        self._store(df)

    def read_local(self):
        """
        For local testing: read the csv from ``self.path`` on the local filesystem.

        :return: None
        """
        logger.info("Reading in open uprn data")

        self._store(pd.read_csv(self.path))

    def create_file_partitions(self, partition_size=50000):
        """
        Split ``self.data`` into consecutive, non-overlapping UPRN ranges.

        Adds a ``partition`` and a ``filename`` column to ``self.data`` and fills
        ``self.filenames`` with a ``{partition: "<min uprn>_<max uprn>.parquet"}`` mapping.

        :param partition_size: Maximum number of rows per partition
        :return: None
        """
        logger.info("Sorting data by UPRN ascending")
        # The index must be rebuilt after sorting: partitions are derived from the
        # row position, and the pre-sort labels (which may also be sparse after
        # filtering) would produce partitions with overlapping UPRN ranges
        self.data = self.data.sort_values("UPRN", ascending=True).reset_index(drop=True)

        logger.info("Creating partitions")
        self.data['partition'] = self.data.index // partition_size

        self.filenames = {}
        for partition, group in tqdm(self.data.groupby('partition')):
            min_uprn = group['UPRN'].min()
            max_uprn = group['UPRN'].max()
            self.filenames[partition] = f"{min_uprn}_{max_uprn}.parquet"

        self.data['filename'] = self.data['partition'].map(self.filenames)

    @staticmethod
    def find_filename_for_uprn(uprn, filenames):
        """
        Find the partition file whose UPRN range contains ``uprn``.

        :param uprn: The UPRN to look up
        :param filenames: Iterable of "<min>_<max>.parquet" names, or a mapping whose
            values are such names (for example ``self.filenames``)
        :return: The matching filename, or None if no range contains the UPRN
        """
        if isinstance(filenames, dict):
            filenames = filenames.values()

        for filename in filenames:
            min_uprn, max_uprn = map(int, filename.replace(".parquet", "").split("_"))
            if min_uprn <= uprn <= max_uprn:
                return filename
        return None

    @staticmethod
    def convert_bng_data_to_gpd(df):
        """
        Convert a dataframe with X_COORDINATE / Y_COORDINATE columns into a GeoDataFrame.

        :param df: Dataframe holding X_COORDINATE and Y_COORDINATE columns
        :return: GeoDataFrame with point geometries in EPSG:27700
        """
        gpd_data = gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
            crs="EPSG:27700"  # British National Grid
        )

        return gpd_data

    def save_filenames_to_s3(self, bucket_name):
        """
        Save the filenames to s3, together with the lower and upper UPRN of each file

        :param bucket_name: Name of the bucket to write the metadata file to
        :return: None
        :raises ValueError: if ``create_file_partitions`` has not been run yet
        """
        if self.filenames is None:
            raise ValueError("No filenames to save - run create_file_partitions first")

        # S3 keys always use forward slashes, regardless of the local OS
        file_key = "spatial/filename_meta.parquet"

        filenames = pd.DataFrame({"filenames": list(self.filenames.values())})
        # Raw pattern: '\d' in a normal string literal is an invalid escape sequence
        filenames[['lower', 'upper']] = filenames['filenames'].str.extract(r'(\d+)_(\d+)')
        filenames['lower'] = filenames['lower'].astype(int)
        filenames['upper'] = filenames['upper'].astype(int)

        logger.info("Saving filenames to s3 at {}".format(file_key))
        save_dataframe_to_s3_parquet(
            df=filenames,
            file_key=file_key,
            bucket_name=bucket_name
        )
|
||||
48
etl/spatial/README.md
Normal file
48
etl/spatial/README.md
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
# Spatial - Geospatial Data Processing Service
|
||||
|
||||
## Overview
|
||||
|
||||
The Spatial service is designed to read, process, and analyze geospatial data related to
|
||||
conservation areas and special buildings. It uses datasets from Historic England and the
|
||||
UK government to determine whether a given UPRN (Unique Property Reference Number) is within
|
||||
a conservation area or is a listed building. The processed data is saved back to an S3 bucket
|
||||
in a parquet format for easy retrieval and further analysis.
|
||||
|
||||
## Dependencies
|
||||
|
||||
Dependencies are listed in requirements.txt. To install them, run:
|
||||
|
||||
```
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Data Sources
|
||||
|
||||
1. **Historic England Conservation Areas**: Shapefile containing polygons of conservation areas.
|
||||
2. **UK Government Conservation Areas**: GeoJSON file containing polygons of conservation areas.
|
||||
3. **Open UPRN Data**: CSV file with UPRN and corresponding geospatial data.
|
||||
4. **Historic England Listed Buildings**: Shapefile with information on listed buildings.
|
||||
5. **Historic England Heritage Buildings at Risk**: Shapefile with information on heritage buildings at risk.
|
||||
|
||||
## Files
|
||||
|
||||
- app.py: Main application file that orchestrates the data processing flow.
|
||||
- ConservationAreaClient.py: Handles reading and processing of conservation area data.
|
||||
- OpenUprnClient.py: Manages reading and partitioning of Open UPRN data.
|
||||
- SpecialBuildingsClient.py: Takes care of reading and processing data related to special buildings.
|
||||
- requirements.txt: Lists all Python package dependencies.
|
||||
|
||||
## How to Run
|
||||
|
||||
1. Make sure you have all the required packages installed.
|
||||
2. Update the S3 bucket and file path constants in app.py.
|
||||
3. Run app.py.
|
||||
|
||||
## Workflow
|
||||
|
||||
1. Read the datasets for conservation areas and special buildings.
|
||||
2. Read the Open UPRN dataset and partition it into smaller chunks based on UPRN.
|
||||
3. For each partition:
|
||||
- Convert UPRN data to geopandas DataFrame.
|
||||
- Check if each UPRN is within a conservation area or is a special building.
|
||||
- Save the processed data back to S3 in parquet format.
|
||||
114
etl/spatial/SpecialBuildingsClient.py
Normal file
114
etl/spatial/SpecialBuildingsClient.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
import geopandas as gpd
|
||||
from shapely.geometry import Point
|
||||
from utils.logger import setup_logger
|
||||
from etl.spatial.ConservationAreaClient import read_shapefile_from_s3
|
||||
from datatypes.datatypes import OpenUprnCoordinateData
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class SpecialBuildingsClient:
    """
    Client around the Historic England datasets, used to determine whether a
    location is a listed building or a heritage building at risk.
    """

    def __init__(self, historic_england_listed_buildings_path, historic_england_heritage_buildings_path, bucket):
        """
        :param historic_england_listed_buildings_path: S3 key of the listed buildings shapefile
        :param historic_england_heritage_buildings_path: S3 key of the heritage buildings shapefile
        :param bucket: Name of the S3 bucket holding both shapefiles
        """
        self.bucket = bucket
        self.historic_england_listed_buildings_path = historic_england_listed_buildings_path
        self.historic_england_heritage_buildings_path = historic_england_heritage_buildings_path

        # Populated by read()
        self.historic_england_listed_buildings = None
        self.historic_england_heritage_buildings = None

    def read(self):
        """
        Load both shapefiles from S3 into GeoDataFrames
        """
        logger.info("Reading in historic england listed buildings shapefile")
        listed = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_listed_buildings_path
        )
        self.historic_england_listed_buildings = listed

        logger.info("Reading in historic england heritage buildings shapefile")
        heritage = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_heritage_buildings_path
        )

        # Only the heritage data is re-projected to british national grid co-ordinates
        self.historic_england_heritage_buildings = heritage.to_crs("EPSG:27700")

    def is_listed_building(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a listed building.

        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE (OpenUprnCoordinateData)
        :return: True if the location lies inside any listed building polygon, False otherwise
        """
        location = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
        hits = self.historic_england_listed_buildings.contains(location)

        if not hits.any():
            return False

        # Log which listed buildings the location falls inside
        names = self.historic_england_listed_buildings.loc[hits, "Name"]
        logger.info(f"The location is within the following listed buildings: {names.values}")
        return True

    def is_listed_building_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Flag every UPRN in ``uprn_gdf`` that lies within a listed building polygon.

        The input GeoDataFrame is modified in place and also returned.

        :param uprn_gdf: GeoDataFrame with a "UPRN" column and point geometries
        :return: the same GeoDataFrame with a boolean ``is_listed_building`` column
        """
        joined = gpd.sjoin(uprn_gdf, self.historic_england_listed_buildings, how="left", predicate="within")

        # Rows without a right-hand index matched no polygon
        uprn_is_listed = joined.loc[joined.index_right.notna(), "UPRN"].unique()

        uprn_gdf['is_listed_building'] = uprn_gdf["UPRN"].isin(uprn_is_listed)

        return uprn_gdf

    def is_heritage_building_at_risk(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a heritage building at risk.

        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE (OpenUprnCoordinateData)
        :return: True if the location lies inside any heritage building at risk polygon, False otherwise
        """
        location = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
        hits = self.historic_england_heritage_buildings.contains(location)

        if not hits.any():
            return False

        # Log which heritage buildings at risk the location falls inside
        names = self.historic_england_heritage_buildings.loc[hits, "EntryName"]
        logger.info(f"The location is within the following heritage buildings at risk: {names.values}")
        return True

    def is_heritage_building_at_risk_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Flag every UPRN in ``uprn_gdf`` that lies within a heritage building at risk polygon.

        The input GeoDataFrame is modified in place and also returned.

        :param uprn_gdf: GeoDataFrame with a "UPRN" column and point geometries
        :return: the same GeoDataFrame with a boolean ``is_heritage_building`` column
        """
        joined = gpd.sjoin(
            uprn_gdf, self.historic_england_heritage_buildings, how="left", predicate="within"
        )

        # Rows without a right-hand index matched no polygon
        uprn_is_heritage = joined.loc[joined.index_right.notna(), "UPRN"].unique()

        uprn_gdf['is_heritage_building'] = uprn_gdf["UPRN"].isin(uprn_is_heritage)

        return uprn_gdf
|
||||
0
etl/spatial/__init__.py
Normal file
0
etl/spatial/__init__.py
Normal file
103
etl/spatial/app.py
Normal file
103
etl/spatial/app.py
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
"""
|
||||
This application reads in the open uprn data from a static location and loads it into
|
||||
our database for querying from other services
|
||||
"""
|
||||
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
from etl.spatial.ConservationAreaClient import ConservationAreaClient
|
||||
from etl.spatial.OpenUprnClient import OpenUprnClient
|
||||
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import save_dataframe_to_s3_parquet
|
||||
|
||||
BUCKET = "retrofit-datalake-dev"
|
||||
OUTPUT_BUCKET = "retrofit-data-dev"
|
||||
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
|
||||
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
|
||||
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
|
||||
HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = "spatial/National_Heritage_List_for_England_(" \
|
||||
"NHLE)/Listed_Building_polygons.shp"
|
||||
HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = \
|
||||
"spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp"
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def app():
    """
    This application uses the conservation area, listed building and heritage-at-risk
    datasets to determine, for each UPRN, whether it is in a conservation area, is a
    listed building or is a heritage building at risk

    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.

    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area

    The open UPRN data can be found here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    The Office for National Statistics Postcode Lookup can be found here:
    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about

    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files and writes the results to s3
    """

    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
        bucket=BUCKET
    )
    conservation_area_client.read()

    special_buildings_client = SpecialBuildingsClient(
        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
        bucket=BUCKET
    )
    special_buildings_client.read()

    open_uprn_client = OpenUprnClient(
        path=OPEN_UPRN_PATHNAME,
        bucket=BUCKET
    )
    open_uprn_client.read()

    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()

    logger.info("Extracting spatial data for uprn partitions")
    # Group once: the same GroupBy object provides both the progress-bar total and the iteration
    partitions = open_uprn_client.data.groupby("filename")

    for filename, uprn_df in tqdm(partitions, total=len(partitions)):
        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)

        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)

        # Convert back to a regular dataframe
        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
        uprn_gdf = pd.DataFrame(uprn_gdf)

        save_dataframe_to_s3_parquet(
            df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
        )

    # We finally save the filenames to s3
    open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
import pytest
|
||||
from model_data.BoreholeClient import BoreholeClient
|
||||
from etl.spatial.BoreholeClient import BoreholeClient
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
0
etl/wall_area/__init__.py
Normal file
0
etl/wall_area/__init__.py
Normal file
|
|
@ -1,5 +1,5 @@
|
|||
"""
|
||||
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
|
||||
This script produces the dataset used to model the wall area of epc, which is used to estimate the cost
|
||||
of insulation measures within homes
|
||||
"""
|
||||
import os
|
||||
|
|
@ -83,7 +83,7 @@ resource "aws_db_instance" "default" {
|
|||
publicly_accessible = true
|
||||
}
|
||||
|
||||
# Set up the bucket that recieve the csv uploads of properties to be retrofit
|
||||
# Set up the bucket that receives the csv uploads of epc to be retrofit
|
||||
module "s3_presignable_bucket" {
|
||||
source = "./modules/s3_presignable_bucket"
|
||||
bucketname = "retrofit-plan-inputs-${var.stage}"
|
||||
|
|
|
|||
12
input_property_list.csv
Normal file
12
input_property_list.csv
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
address,postcode,Notes,,,,
|
||||
28 Distillery Wharf,W6 9bf,,,,,
|
||||
Flat 14 Godley V C House,E2 0LP,,,,,
|
||||
49 Elderfield Road,E5 0LF,,,,,
|
||||
26 Stanhope Road,N6 5NG,,,,,
|
||||
Flat 3 Frederick Building,N1 4BD,,,,,
|
||||
Flat 4 Frederick Building,N1 4BD,,,,,
|
||||
"Flat 28, 22 Adelina Grove",E1 3BX,,,,,
|
||||
"Flat 39, 239 Long Lane",SE1 4PT,,,,,
|
||||
"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,,
|
||||
"59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,,
|
||||
88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,,
|
||||
|
|
|
@ -1,49 +0,0 @@
|
|||
# Environment setup
|
||||
|
||||
We're using conda to manage environments to circumvent the
|
||||
issues with Mac M1. This documentation will also cover Pycharm setup.
|
||||
|
||||
We're working in python 3.10 so
|
||||
|
||||
```commandline
|
||||
conda create -n hestia-data python=3.10
|
||||
```
|
||||
|
||||
Then activate the environment
|
||||
|
||||
```commandline
|
||||
conda activate hestia-data
|
||||
```
|
||||
|
||||
To set up with Pycharm, run
|
||||
|
||||
```commandline
|
||||
which python
|
||||
```
|
||||
|
||||
and grab the path to the python executable. Then in Pycharm, go to
|
||||
Settings > Project > Python Interpreter and click the gear icon
|
||||
to add a new interpreter. Select Conda and either paste the path to the python executable
|
||||
and click OK, or select the conda environment from the dropdown.
|
||||
|
||||
You may need to restart Pycharm for the new interpreter to be recognised.
|
||||
|
||||
To install project dependencies navigate to /model_data and run
|
||||
|
||||
```commandline
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### Running Tests
|
||||
|
||||
If you are not in a virtual environment, activate it with
|
||||
|
||||
```commandline
|
||||
conda activate envName
|
||||
```
|
||||
|
||||
Then run
|
||||
|
||||
```commandline
|
||||
pytest --cov-config=model_data/.coveragerc --cov=model_data
|
||||
```
|
||||
|
|
@ -1,650 +0,0 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import statsmodels.api as sm
|
||||
import matplotlib.pyplot as plt
|
||||
from typing import Dict, Optional, List
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
||||
median_absolute_error, mean_absolute_percentage_error
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.inspection import permutation_importance
|
||||
from model_data.EpcClean import EpcClean
|
||||
|
||||
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
||||
from tqdm import tqdm
|
||||
from utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class SapModel:
|
||||
# We want to estimate for making improvements on different property components
|
||||
RESPONSE = "current-energy-efficiency"
|
||||
# We could potentially build models by constituency to avoid having too many
|
||||
# features in the model
|
||||
BASE_FEATURES = [
|
||||
"property-type",
|
||||
"built-form",
|
||||
"construction-age-band",
|
||||
"number-habitable-rooms",
|
||||
"constituency",
|
||||
"number-heated-rooms",
|
||||
"transaction-type"
|
||||
]
|
||||
|
||||
COMPONENT_FEATURES = [
|
||||
"walls-description",
|
||||
"floor-description",
|
||||
"lighting-description",
|
||||
"roof-description",
|
||||
"mainheat-description",
|
||||
"hotwater-description",
|
||||
"main-fuel",
|
||||
"mechanical-ventilation",
|
||||
"secondheat-description",
|
||||
"energy-tariff",
|
||||
"solar-water-heating-flag",
|
||||
"photo-supply",
|
||||
"windows-description",
|
||||
"glazed-type",
|
||||
"glazed-area",
|
||||
"multi-glaze-proportion",
|
||||
# "lighting-description" # Might not need to use this
|
||||
"low-energy-lighting",
|
||||
"number-open-fireplaces",
|
||||
"mainheatcont-description",
|
||||
"fixed-lighting-outlets-count",
|
||||
"floor-height",
|
||||
"floor-level",
|
||||
"total-floor-area",
|
||||
"extension-count",
|
||||
]
|
||||
|
||||
CATEGORICAL_COLS = [
|
||||
"property-type",
|
||||
"built-form",
|
||||
"number-habitable-rooms",
|
||||
"constituency",
|
||||
"number-heated-rooms",
|
||||
"mainheat-description",
|
||||
"hotwater-description",
|
||||
"main-fuel",
|
||||
"mechanical-ventilation",
|
||||
"secondheat-description",
|
||||
"energy-tariff",
|
||||
"solar-water-heating-flag",
|
||||
"windows-description",
|
||||
"glazed-type",
|
||||
"glazed-area",
|
||||
"construction-age-band",
|
||||
"lighting-description",
|
||||
"mainheatcont-description",
|
||||
"floor-level",
|
||||
]
|
||||
|
||||
NUMERICAL_COLUMNS = [
|
||||
"photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces",
|
||||
"fixed-lighting-outlets-count",
|
||||
"floor-height",
|
||||
"total-floor-area",
|
||||
"extension-count",
|
||||
]
|
||||
|
||||
# For the moment, we store records of the best performing models as a benchmark for future improvements
|
||||
BEST_FIT = {
|
||||
'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763,
|
||||
'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118,
|
||||
'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197
|
||||
}
|
||||
|
||||
BEST_PREDICT = {
|
||||
'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514,
|
||||
'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312,
|
||||
'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828
|
||||
}
|
||||
|
||||
BEST_FINAL = {
|
||||
'MAPE': 0.04841470773386795, 'Mean Squared Error': 21.323052316630914, 'Mean Absolute Error': 2.988547998636157,
|
||||
'R2 Score': 0.7633662459299112, 'Explained Variance Score': 0.7633785339028832,
|
||||
'Median Absolute Error': 1.9487883489495985
|
||||
}
|
||||
|
||||
BUCKET_VARIABLES = [
|
||||
"number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion'
|
||||
]
|
||||
|
||||
def __init__(
        self, data: List[Dict],
        cleaner: EpcClean,
        test_size: Optional[float] = 0.2,
        random_state: Optional[int] = None
):
    """
    Store the raw records and initialise every piece of state the modelling pipeline fills in later.

    :param data: Records to model, one dictionary per row; converted to a dataframe
    :param cleaner: EpcClean instance, kept for use by the methods that merge cleaned values
    :param test_size: Fraction of the data held out for testing; None falls back to 0.2
    :param random_state: Seed used for splitting and model fitting; None falls back to 42
    """
    self.df = pd.DataFrame(data)
    self.cleaner = cleaner
    self.random_state = 42 if random_state is None else random_state
    self.test_size = test_size if test_size is not None else 0.2

    # Modelling dataset and its train/test split
    self.model_data = None
    self.train_x = self.train_y = None
    self.test_x = self.test_y = None

    # Fitted models
    self.test_model = None
    self.final_model = None

    # Error metrics plus the rows behind the largest errors
    self.fit_error = None
    self.predict_error = None
    self.final_error = None
    self.worst = {
        key: pd.DataFrame()
        for key in ("fit_errors", "prediction_errors", "fit_x", "prediction_x", "final_errors", "final_x")
    }

    # Fitted-vs-actual frames and diagnostics
    self.fit_df = None
    self.predict_df = None
    self.final_fit_df = None
    self.diagnosis = {}
|
||||
|
||||
def run(self, plot: bool = False) -> None:
    """
    Run the pipeline end to end: build the dataset, fit the model and optionally plot the fit.

    Any exception raised along the way is logged and not re-raised.

    :param plot: Boolean to indicate whether to plot the regression
    """
    try:
        self.create_dataset()
        self.fit_model()
        if not plot:
            return
        self.plot_regression(self.fit_df)
    except Exception as error:
        logger.error("An error occurred during execution.")
        logger.error(str(error))
|
||||
|
||||
def _merge_with_u_values(
|
||||
self, model_data: pd.DataFrame, description: str, thermal_transmittance: str
|
||||
) -> pd.DataFrame:
|
||||
|
||||
"""
|
||||
Utility function to merge u value data with model data
|
||||
:param model_data: Pandas dataframe which is the main modelling dataset
|
||||
:param description: Name of the description column for which we're merging u-values onto
|
||||
:param thermal_transmittance: Name of the thermal transmittance column
|
||||
:return:
|
||||
"""
|
||||
|
||||
u_values = pd.DataFrame(self.cleaner.cleaned[f"{description}-description"])[
|
||||
["original_description", thermal_transmittance]].rename(
|
||||
columns={thermal_transmittance: f"{description}_u_value"}
|
||||
)
|
||||
|
||||
model_data = model_data.merge(
|
||||
u_values,
|
||||
how="left",
|
||||
left_on=f"{description}-description",
|
||||
right_on="original_description"
|
||||
).drop(columns=["original_description"])
|
||||
|
||||
return model_data
|
||||
|
||||
def _append_cleaned_data(self, model_data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Appends cleaned data into the model data.
|
||||
:param model_data: Original model data.
|
||||
:return: Model data with cleaned data appended.
|
||||
"""
|
||||
for description in ["walls", "floor", "roof"]:
|
||||
model_data = self._merge_with_u_values(model_data, description, "thermal_transmittance")
|
||||
|
||||
# lighting_proportions added separately as it doesn't use the _merge_with_u_values method
|
||||
lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
|
||||
["original_description", "low_energy_proportion"]]
|
||||
|
||||
model_data = model_data.merge(
|
||||
lighting_proportions,
|
||||
how="left",
|
||||
left_on="lighting-description",
|
||||
right_on="original_description"
|
||||
).drop(columns=["original_description"])
|
||||
|
||||
return model_data
|
||||
|
||||
@staticmethod
|
||||
def _convert_transaction_type(model_data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Converts transaction type to boolean
|
||||
:param model_data: Model data with transaction type.
|
||||
:return: Model data with converted transaction type.
|
||||
"""
|
||||
model_data["is_rdsap"] = model_data["transaction-type"] != "new dwelling"
|
||||
model_data = model_data.drop(columns=["transaction-type"])
|
||||
return model_data
|
||||
|
||||
@staticmethod
|
||||
def bucket_and_fill(df: pd.DataFrame, column_name: str, n_bins: int = 10) -> pd.DataFrame:
|
||||
"""
|
||||
Simple utility function to bucket up features into bins and then fill any missing values with "NO_RECORD"
|
||||
:param df: Dataframe of features to be binned
|
||||
:param column_name: Name of the column to be binned
|
||||
:param n_bins: Number of bins to use
|
||||
:return: Dataframe with binned column
|
||||
"""
|
||||
# Check if the column is numerical
|
||||
if np.issubdtype(df[column_name].dtype, np.number):
|
||||
# Create a new categorical column from numerical one by binning the data
|
||||
df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
|
||||
# Replace missing data with "NO_RECORD"
|
||||
df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
|
||||
df[column_name + "_bucket"] = np.where(
|
||||
df[column_name + "_bucket"] == "nan",
|
||||
"NO_RECORD",
|
||||
df[column_name + "_bucket"]
|
||||
)
|
||||
return df
|
||||
|
||||
def _clean_numericals(self, model_data):
|
||||
|
||||
# Try binning numericals
|
||||
remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in self.BUCKET_VARIABLES]
|
||||
|
||||
for col in self.BUCKET_VARIABLES:
|
||||
model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
|
||||
# If all values are missing, set all values to 0 - this column will get dropped
|
||||
if all(pd.isnull(model_data[col])):
|
||||
model_data[col + "_bucket"] = "NO_RECORD"
|
||||
continue
|
||||
model_data = self.bucket_and_fill(model_data, col)
|
||||
|
||||
# Replace the data with the binned version
|
||||
model_data = model_data.drop(columns=self.BUCKET_VARIABLES)
|
||||
model_data = model_data.rename(
|
||||
columns=dict(zip([c + "_bucket" for c in self.BUCKET_VARIABLES], self.BUCKET_VARIABLES))
|
||||
)
|
||||
|
||||
# Basic fill the rest of the columns with 0 - currenrtly this provided the best performance
|
||||
for col in remaining_numericals:
|
||||
model_data[col] = np.where(
|
||||
model_data[col] == "", "0", model_data[col]
|
||||
).astype(float)
|
||||
|
||||
return model_data
|
||||
|
||||
@staticmethod
|
||||
def clean_missings(model_data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Fills categorical missing data with sensible values
|
||||
:param model_data: Original model data.
|
||||
:return: Model data with cleaned categorical data.
|
||||
"""
|
||||
|
||||
# Cleaning of energy-tariff and construction-age-band hurt prediction performance, indicating there is
|
||||
# potentially
|
||||
# a notable difference between a "" missing and a "NO DATA!" missing, worth differentiating
|
||||
|
||||
model_data["mechanical-ventilation"] = np.where(
|
||||
model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"]
|
||||
)
|
||||
|
||||
model_data["solar-water-heating-flag"] = np.where(
|
||||
model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"]
|
||||
)
|
||||
|
||||
model_data["glazed-type"] = np.where(
|
||||
model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"]
|
||||
)
|
||||
|
||||
model_data["glazed-area"] = np.where(
|
||||
model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"]
|
||||
)
|
||||
|
||||
return model_data
|
||||
|
||||
def create_dataset(self):
    """
    Build the modelling dataset from the raw records and store it on self.model_data.

    Selects the response and feature columns, appends cleaned u-values, fills missing values,
    converts the transaction type, cleans the numerical columns and keeps only rows that have
    wall, floor and roof u-values.
    """
    logger.info("Creating modelling dataset")
    model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
    model_data = model_data.reset_index(drop=True)
    model_data["idx"] = model_data.index.copy()

    # Append on u-values
    model_data = self._append_cleaned_data(model_data)

    model_data = self.clean_missings(model_data)

    # Convert transaction_type
    model_data = self._convert_transaction_type(model_data)

    # Clean numerical columns
    model_data = self._clean_numericals(model_data)

    # Take just entries with U-values
    # TODO: Rather than doing this, do we want to include the estimated u-values?
    #  Since this ends up with just 2k entries
    u_value_columns = ["walls_u_value", "floor_u_value", "roof_u_value"]
    model_data = model_data[model_data[u_value_columns].notna().all(axis=1)]

    exclude_features = [
        "walls-description", "floor-description", "roof-description", "transaction-type"
    ]

    features = [
        x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + u_value_columns + [
            self.RESPONSE, "idx", "is_rdsap"
        ] if x not in exclude_features
    ]

    # Copy so the dtype conversions below are applied to an independent frame rather than
    # to a selection taken from the filtered data
    model_data = model_data[features].copy()

    for col in self.CATEGORICAL_COLS:
        model_data[col] = model_data[col].astype('category')

    # Convert response
    model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float)

    self.model_data = model_data
|
||||
|
||||
def make_training_test(self, x):
    """
    Split the dataset into training and test covariates/responses and store them on the instance.

    :param x: Dataframe holding the covariates together with the response column
    """
    covariates = x.drop(self.RESPONSE, axis=1)
    response = x[self.RESPONSE]

    split = train_test_split(
        covariates,
        response,
        test_size=self.test_size,
        random_state=self.random_state
    )
    self.train_x, self.test_x, self.train_y, self.test_y = split
|
||||
|
||||
@staticmethod
|
||||
def remove_zero_std_cols(train_x, test_x=None, threshold=1e-3):
|
||||
"""
|
||||
Utility function to remove columns that have zero standard deviation from both test and train sets
|
||||
:param train_x: Training data to remove columns from
|
||||
:param test_x: If provided, remove the same columns from the test data
|
||||
:param threshold: float value, if the standard deviation is below this threshold, the column is considered
|
||||
to have zero standard deviation
|
||||
:return: Tuple of train_x and test_x (if provided). If test_x is not provided, a null placeholder is returned
|
||||
"""
|
||||
# Compute standard deviations
|
||||
std_devs = train_x.std()
|
||||
|
||||
# Find columns with zero or near-zero standard deviation
|
||||
zero_std_cols = std_devs[std_devs <= threshold].index
|
||||
|
||||
# Drop these columns from the training data
|
||||
train_x = train_x.drop(zero_std_cols, axis=1)
|
||||
|
||||
if test_x is not None:
|
||||
# Ensure the test data has the same columns
|
||||
test_x = test_x[train_x.columns]
|
||||
return train_x, test_x
|
||||
|
||||
return train_x, None
|
||||
|
||||
def fit_model(self):
    """
    Main function to fit the model and produce accuracy metrics

    Fits an OLS model on the training split, scores it on the training and test data against the
    stored benchmarks, and then fits a final OLS model on the full dataset using the columns that
    survived the training-time pre-processing. Results are stored on the instance.
    """

    x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True)

    # Convert booleans to integer and object columns to float
    for col in x.columns:
        if x[col].dtype == bool:
            x[col] = x[col].astype(int)

        if x[col].dtype == object:
            x[col] = x[col].astype(float)

    # Create the training and test sets for each run
    self.make_training_test(x)
    self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x)
    logger.info("Detecting multi-collinearity in training dataset")
    self.detect_multi_collinearity()

    # Add a constant to the independent value.
    # has_constant="add" always appends the intercept column; the default ("skip") would leave it
    # out if a feature happened to be constant within a split, so the design matrices could differ
    train_x = sm.add_constant(self.train_x, has_constant="add")
    test_x = sm.add_constant(self.test_x, has_constant="add")
    train_idx = train_x["idx"].copy()
    test_idx = self.test_x["idx"].copy()
    train_x = train_x.drop(columns=["idx"])
    test_x = test_x.drop(columns=["idx"])

    logger.info("Fitting testing model")
    # make regression model
    model = sm.OLS(self.train_y, train_x)
    # fit model and print results
    self.test_model = model.fit()

    train_predictions = self.test_model.fittedvalues
    test_predictions = self.test_model.predict(test_x)

    self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics(
        y_true=self.train_y, y_pred=train_predictions
    )

    # Predict on new data
    self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics(
        y_true=self.test_y, y_pred=test_predictions
    )

    fit_success = self.check_successes(self.fit_error, self.BEST_FIT)
    predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT)

    self.model_data['fit'] = self.test_model.fittedvalues
    # The worst errors over index heavily for flats
    self.worst["fit_x"] = self.model_data[self.model_data.index.isin(self.worst["fit_errors"].index)]
    self.worst["prediction_x"] = self.model_data[self.model_data.index.isin(self.worst["prediction_errors"].index)]

    self.fit_df = pd.DataFrame(
        {
            "fit": train_predictions,
            "actual": self.train_y,
            "idx": train_idx
        }
    ).sort_values("actual", ascending=True)

    self.predict_df = pd.DataFrame(
        {
            "predictions": test_predictions,
            "actual": self.test_y,
            "idx": test_idx
        }
    )

    self.diagnosis = {
        "fit_success": fit_success,
        "predict_success": predict_success,
        "summary": self.test_model.summary()
    }

    # We're now ready to fit the final model
    # For the moment, the pre-processing at the top of this function merely removes columns, so we
    # just need to remove the columns that were removed from the training data from the final model
    logger.info("Fitting final model")
    y = x[self.RESPONSE]
    x = x[self.train_x.columns]
    idx = x["idx"].copy()
    # The constant is added after selecting the training columns: self.train_x has no constant
    # column, so adding it before the selection would drop the intercept from the final model
    x = sm.add_constant(x.drop(columns=["idx"]), has_constant="add")

    final_model = sm.OLS(y, x)
    # fit model and print results
    self.final_model = final_model.fit()
    final_predictions = self.final_model.fittedvalues

    self.final_error, self.worst["final_errors"] = self.calculate_regression_metrics(
        y_true=y, y_pred=final_predictions
    )

    self.final_fit_df = pd.DataFrame(
        {
            "fit": final_predictions,
            "actual": y,
            "idx": idx
        }
    ).sort_values("actual", ascending=True)
|
||||
|
||||
@staticmethod
|
||||
def check_successes(experiment_error, best_error):
|
||||
"""
|
||||
Simple function to check if the experiment error is better than the best error
|
||||
:param experiment_error: output of calculate_regression_metrics() on the experiment
|
||||
:param best_error: Current benchmark best error
|
||||
:return:
|
||||
"""
|
||||
|
||||
successes = []
|
||||
for k in experiment_error:
|
||||
if k in ["Explained Variance Score", "R2 Score"]:
|
||||
# We want to maximise this so we want experiment error to be higher
|
||||
successes.append(
|
||||
{
|
||||
"measure": k,
|
||||
"success": experiment_error[k] >= best_error[k],
|
||||
"difference": abs(experiment_error[k] - best_error[k])
|
||||
}
|
||||
)
|
||||
continue
|
||||
successes.append(
|
||||
{
|
||||
"measure": k,
|
||||
"success": experiment_error[k] <= best_error[k],
|
||||
"difference": abs(experiment_error[k] - best_error[k])
|
||||
}
|
||||
)
|
||||
|
||||
return pd.DataFrame(successes)
|
||||
|
||||
def rf_importance(self, train_x, train_y, test_x, test_y):
    """
    Estimate feature importance with a random forest, to get a sense of which features
    drive model performance.

    :param train_x: Training data covariates to build the importance model on
    :param train_y: Training data response to build the importance model on
    :param test_x: Test data covariates to build the permutation importance model on
    :param test_y: Test data response to build the permutation importance model on
    :return: Tuple of (random forest importances, permutation importances), each a dataframe
        ranked from most to least important
    """
    forest = RandomForestRegressor(random_state=self.random_state)
    forest.fit(train_x, train_y)

    # Pair every training column with the importance the forest assigned to it
    ranked_importance = pd.DataFrame(
        {
            "Feature": list(train_x.columns),
            "rf_importance": forest.feature_importances_
        }
    ).sort_values(by="rf_importance", ascending=False)

    return ranked_importance, self.permuation_importance(forest, test_x, test_y)
|
||||
|
||||
@staticmethod
def permuation_importance(rf, test_x, test_y):
    """
    Produce the permutation importance of each feature for a fitted model, scored on
    negative mean squared error.

    :param rf: Random forest model to calculate permutation importance for
    :param test_x: Test covariates to be used for permutation importance
    :param test_y: Test response to be used for permutation importance
    :return: Dataframe of "Feature" and mean "perm_importance", ranked from most to least important
    """
    result = permutation_importance(rf, test_x, test_y, scoring='neg_mean_squared_error')

    ranking = pd.DataFrame(
        {
            "Feature": test_x.columns,
            "perm_importance": result.importances_mean
        }
    )
    return ranking.sort_values(by="perm_importance", ascending=False)
|
||||
|
||||
def detect_multi_collinearity(self):
    """
    Drop training features with an infinite variance inflation factor (VIF).

    The u-value columns, "idx" and "is_rdsap" are never dropped. The test data is reduced to the
    same columns as the training data afterwards.
    """
    # Get the VIFs for each variable.
    # The design matrix is extracted once instead of once per column inside the loop
    design_matrix = self.train_x.values
    vifs = pd.DataFrame()
    vifs["features"] = self.train_x.columns
    vifs["vif"] = [variance_inflation_factor(design_matrix, i) for i in tqdm(range(design_matrix.shape[1]))]

    # Get the features with the highest VIF
    vifs = vifs.sort_values("vif", ascending=False)

    # There are some features, we do not want to remove
    required_features = [
        "walls_u_value", "floor_u_value", "roof_u_value", "idx", "is_rdsap"
    ]

    vifs = vifs[~vifs["features"].isin(required_features)]
    drop_vifs = vifs[np.isinf(vifs["vif"])]

    # Acceptable drop variables:
    # main-fuel_Gas: mains gas
    # glazed-type_NO DATA!
    # glazed-area_NO DATA!

    self.train_x = self.train_x.drop(columns=drop_vifs["features"].values)
    self.test_x = self.test_x[self.train_x.columns]
|
||||
|
||||
@staticmethod
def plot_regression(df):
    """
    Plot the fitted values against the actual values, one point per row in row order.

    :param df: Dataframe with "fit" and "actual" columns
    """
    # Pull both series up front so a missing column fails before anything is drawn
    lines = (
        ("Fit", "red", df['fit']),
        ("Actual", "blue", df['actual']),
    )

    # Rows are plotted against their position in the dataframe
    positions = np.arange(len(df))
    for label, colour, values in lines:
        plt.plot(positions, values, color=colour, label=label)

    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title('Linear Regression - Fit vs Actual')
    plt.legend()
    plt.show()
|
||||
|
||||
@staticmethod
def calculate_regression_metrics(y_true, y_pred, n=20):
    """
    Calculate accuracy metrics for a regression and collect the rows with the largest errors.

    Args:
        y_true (array-like): Array of true target values.
        y_pred (array-like): Array of predicted target values.
        n (int): Number of largest absolute residuals to return.

    Returns:
        tuple: (metrics, worst_errors) where metrics is a dictionary of the calculated metrics
        and worst_errors is a dataframe of the n rows with the largest absolute residual, with
        columns 'Fit' (predicted), 'Actual' (true), 'Residual' (Actual - Fit) and
        'Absolute Residual'.
    """
    metrics = {
        'MAPE': mean_absolute_percentage_error(y_true, y_pred),
        'Mean Squared Error': mean_squared_error(y_true, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),
        'Explained Variance Score': explained_variance_score(y_true, y_pred),
        'Median Absolute Error': median_absolute_error(y_true, y_pred)
    }

    errors = pd.DataFrame()
    # The true values are assigned first so they define the index, then the predictions are attached
    errors['Actual'] = y_true
    errors['Fit'] = y_pred
    errors['Residual'] = errors['Actual'] - errors['Fit']
    errors['Absolute Residual'] = np.abs(errors['Residual'])
    errors = errors[['Fit', 'Actual', 'Residual', 'Absolute Residual']]

    worst_errors = errors.nlargest(n, 'Absolute Residual')

    return metrics, worst_errors
|
||||
|
|
@ -1,207 +0,0 @@
|
|||
import pickle
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from model_data.EpcClean import EpcClean
|
||||
|
||||
|
||||
class UvalueEstimations:
    """
    Build median thermal-transmittance (U-value) lookup tables for walls and floors.

    Records whose element description contains "Average thermal transmittance"
    are grouped by property characteristics and by the decile of their total
    floor area; the median transmittance and the sample count are stored per group.
    """

    # Only descriptions containing this text are used to build the estimates
    _TRANSMITTANCE_MARKER = "Average thermal transmittance"

    # Column that is broken into deciles before grouping
    _AREA_COLUMN = "total-floor-area"

    def __init__(self, data: list):
        """
        Initialize the UvalueEstimations class.

        :param data: The input data as a list of dictionaries, to be converted to a dataframe
        """
        self.data = pd.DataFrame(data)
        self.walls = None
        self.walls_decile_data = {}
        self.roofs = None
        self.floors = None
        self.floors_decile_data = {}

    def get_estimates(self, cleaner: "EpcClean"):
        """
        Calculate U-value estimates for walls, roofs, and floors.

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.set_walls(cleaner)
        self.set_roofs(cleaner)
        self.set_floors(cleaner)

    def set_walls(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for walls.

        Populates ``self.walls`` (summary table) and ``self.walls_decile_data``
        (decile labels and boundaries of the floor area).

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.walls, self.walls_decile_data = self._summarise_element("walls", cleaner)

    def set_roofs(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for roofs.

        Not implemented: this is a no-op and ``self.roofs`` stays ``None``.

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        pass

    def set_floors(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for floors.

        Populates ``self.floors`` (summary table) and ``self.floors_decile_data``
        (decile labels and boundaries of the floor area).

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        # The floor columns use the singular prefix ("floor-description", ...)
        self.floors, self.floors_decile_data = self._summarise_element("floor", cleaner)

    def _summarise_element(self, element: str, cleaner: "EpcClean"):
        """
        Build the median U-value summary for one building element.

        :param element: Column prefix of the element, e.g. "walls" or "floor"; the
            columns "<element>-description", "<element>-energy-eff" and
            "<element>-env-eff" are read from the data.
        :param cleaner: Object whose ``cleaned["<element>-description"]`` holds records
            with the keys "original_description" and "thermal_transmittance".

        :return: A tuple containing:
            - The summary DataFrame (group keys, "median_thermal_transmittance", "n_samples").
            - A dict with the keys "decile_labels" and "decile_boundaries".
        """
        description_col = f"{element}-description"
        energy_col = f"{element}-energy-eff"
        env_col = f"{element}-env-eff"
        area_col = self._AREA_COLUMN

        wanted_columns = [
            "local-authority", "property-type", description_col, energy_col, env_col, "built-form",
            area_col, "number-habitable-rooms", "number-heated-rooms"
        ]

        # na=False: records with a missing description are treated as "no match"
        # instead of producing an NA mask that cannot be used for row selection.
        has_u_value = self.data[description_col].str.contains(self._TRANSMITTANCE_MARKER, na=False)

        # Take just the rows and columns we want; copy so the assignments below
        # act on an independent frame rather than on a slice of self.data.
        element_df = self.data.loc[has_u_value, wanted_columns].copy()
        element_df[area_col] = element_df[area_col].astype(float)

        element_df, decile_labels, decile_boundaries = self.classify_into_deciles(element_df, area_col)

        # We now get the U-values by matching on the original description text
        u_values = pd.DataFrame(cleaner.cleaned[description_col])[["original_description", "thermal_transmittance"]]
        element_df = element_df.merge(
            u_values,
            how="left",
            right_on="original_description",
            left_on=description_col
        )

        group_keys = [
            "local-authority",
            "property-type",
            energy_col,
            env_col,
            "built-form",
            "number-habitable-rooms",
            "number-heated-rooms",
            area_col + "_group"
        ]

        # observed=True: only decile groups that actually occur produce a row
        u_value_summary = element_df.groupby(
            group_keys,
            observed=True
        ).agg({"thermal_transmittance": ["median", "size"]}).reset_index()

        # Flatten the two-level column header produced by the list aggregation
        u_value_summary.columns = group_keys + ["median_thermal_transmittance", "n_samples"]

        decile_data = {
            "decile_labels": decile_labels,
            "decile_boundaries": decile_boundaries
        }
        return u_value_summary, decile_data

    @staticmethod
    def classify_into_deciles(df: pd.DataFrame, column: str) -> "tuple[pd.DataFrame, list, np.ndarray]":
        """
        Break a column in a Pandas DataFrame into deciles.

        The decile of each row is written to a new column named "<column>_group";
        the input DataFrame is modified in place and also returned.

        :param df: The input Pandas DataFrame.
        :param column: The column name to break into deciles.

        :return: A tuple containing:
            - The DataFrame with the decile group column.
            - The list of decile labels ("Decile 1" ... "Decile 10").
            - The decile boundaries (11 values, from the 0th to the 100th percentile).

        :raises ValueError: Raised by ``pd.cut`` when the boundaries are not unique,
            e.g. when many rows share the same value.
        """
        # Calculate decile boundaries
        decile_boundaries = np.percentile(df[column], np.arange(0, 101, 10))

        # Create decile labels
        decile_labels = [f"Decile {i + 1}" for i in range(10)]

        # Assign decile labels to existing values; include_lowest keeps the minimum in "Decile 1"
        df[column + "_group"] = pd.cut(df[column], bins=decile_boundaries, labels=decile_labels,
                                       include_lowest=True)

        return df, decile_labels, decile_boundaries

    @staticmethod
    def classify_decile_newvalues(decile_boundaries, decile_labels, new_values: list) -> list:
        """
        Classify new values into existing deciles based on decile definitions.

        :param decile_boundaries: The list of decile boundaries.
        :param decile_labels: The list of decile labels.
        :param new_values: A list of new values to classify.

        :return: The classifications for the new values as a list; values outside the
            boundaries are returned as NaN.
        """
        # Classify new values based on decile definitions
        classifications = pd.cut(new_values, bins=decile_boundaries, labels=decile_labels, include_lowest=True)
        return classifications.tolist()

    def _save(self, filename):
        """
        Useful utility function to store this object, which is particularly handy for unit testing

        :param filename: Path of the file the pickled object is written to.
        :return: None
        """
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load variables from the model_data/.env file at import time.
# NOTE(review): the path is relative - presumably resolved against the working directory; confirm.
load_dotenv(dotenv_path='model_data/.env')
|
||||
|
||||
# Auth token read from the EPC_AUTH_TOKEN environment variable; None when it is not set.
# NOTE(review): presumably the credential for the EPC API client - confirm against its users.
EPC_AUTH_TOKEN = os.environ.get('EPC_AUTH_TOKEN')
|
||||
|
|
@ -1,29 +0,0 @@
|
|||
import time
|
||||
|
||||
|
||||
def pagenated_epc_download(client, params, page_size, n_pages, verbose=0, slowdown=0.1):
    """
    Download search results page by page and return all rows as one list.

    Paging stops as soon as ``n_pages`` pages have been collected or the search
    returns a falsy response, whichever happens first.

    :param client: Object exposing ``client.domestic.search(params=..., offset_from=..., size=...)``.
    :param params: Search parameters passed through to every request.
    :param page_size: Number of rows requested per page; also the offset step.
    :param n_pages: Number of pages to collect.
    :param verbose: When truthy, print the page number before each request.
    :param slowdown: Seconds to sleep before every request.
    :return: List with the "rows" of every collected page, in request order.
    """
    rows = []
    pages_fetched = 0
    offset = 0

    while True:
        if verbose:
            print("Pulling for page %s" % str(int(offset / page_size) + 1))
        time.sleep(slowdown)
        response = client.domestic.search(params=params, offset_from=offset, size=page_size)

        # Note: We can only make 10k queries for a single set of search queries.
        # It might make sense to download data via zip for machine learning since we don't need this
        # data to be perfectly up to date
        if not response:
            return rows

        pages_fetched += 1
        rows.extend(response["rows"])

        if pages_fetched == n_pages:
            return rows

        # Move on to the next page
        offset += page_size
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def create_heatmap_plots(data, response_var, pivot_var1, pivot_var2, order1=None, order2=None):
    """
    Draw an annotated heatmap of a response variable across two pivot variables.

    :param data: List of dictionaries, input data.
    :param response_var: String, name of the response variable shown in the cells.
    :param pivot_var1: String, variable used for the heatmap rows.
    :param pivot_var2: String, variable used for the heatmap columns.
    :param order1: List, the order of categories for pivot_var1. Optional.
    :param order2: List, the order of categories for pivot_var2. Optional.

    :return: None. Displays the generated plot.
    """
    heading = f"Heatmap of {response_var} by {pivot_var1} and {pivot_var2}"

    # Build the table and make sure the response variable is numeric
    frame = pd.DataFrame(data)
    frame[response_var] = frame[response_var].astype(float)

    # Rows come from pivot_var1, columns from pivot_var2
    grid = frame.pivot_table(values=response_var, index=pivot_var1, columns=pivot_var2)

    # Apply the requested category order, if any
    if order1 is not None:
        grid = grid.reindex(order1)
    if order2 is not None:
        grid = grid[order2]

    # Render the heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(grid, annot=True, fmt=".2f", cmap='coolwarm')
    plt.title(heading)
    plt.show()
|
||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue