mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
commit
b2142a7f8e
154 changed files with 1977 additions and 13742 deletions
10
.coveragerc
10
.coveragerc
|
|
@ -2,12 +2,8 @@
|
||||||
omit =
|
omit =
|
||||||
*__init__*
|
*__init__*
|
||||||
*/tests/*
|
*/tests/*
|
||||||
model_data/temp_inputs.py
|
|
||||||
model_data/config.py
|
|
||||||
model_data/__init__.py
|
|
||||||
model_data/app.py
|
|
||||||
model_data/plotting/*
|
|
||||||
recommendations/rdsap_tables.py
|
recommendations/rdsap_tables.py
|
||||||
model_data/simulation_system/*
|
*/config.py
|
||||||
model_data/cleaner_app.py
|
*/app.py
|
||||||
|
*/settings.py
|
||||||
backend/app/*
|
backend/app/*
|
||||||
81
.github/workflows/deploy_sap_model_lambda.yml
vendored
81
.github/workflows/deploy_sap_model_lambda.yml
vendored
|
|
@ -1,81 +0,0 @@
|
||||||
name: Sap Model Deploy
|
|
||||||
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
branches: [ dev, prod ]
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
deploy:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Checkout code
|
|
||||||
uses: actions/checkout@v3
|
|
||||||
|
|
||||||
- name: Set up Python
|
|
||||||
uses: actions/setup-python@v2
|
|
||||||
with:
|
|
||||||
python-version: 3.10.12
|
|
||||||
|
|
||||||
- name: Install Serverless and plugins
|
|
||||||
run: |
|
|
||||||
npm install -g serverless
|
|
||||||
npm install -g serverless-domain-manager
|
|
||||||
|
|
||||||
- name: AWS credentials for dev
|
|
||||||
if: github.ref == 'refs/heads/dev'
|
|
||||||
uses: aws-actions/configure-aws-credentials@v1
|
|
||||||
with:
|
|
||||||
aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
|
|
||||||
aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
|
|
||||||
aws-region: eu-west-2
|
|
||||||
|
|
||||||
- name: AWS credentials for prod
|
|
||||||
if: github.ref == 'refs/heads/prod'
|
|
||||||
uses: aws-actions/configure-aws-credentials@v1
|
|
||||||
with:
|
|
||||||
aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
|
|
||||||
aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
|
|
||||||
aws-region: eu-west-2
|
|
||||||
|
|
||||||
- name: Set domain name
|
|
||||||
id: set_domain
|
|
||||||
run: echo "::set-output name=domain::${{ secrets[format('{0}_DOMAIN_NAME', github.ref_name)] }}"
|
|
||||||
|
|
||||||
- name: Set ECR credentials
|
|
||||||
id: set_ecr_credentials
|
|
||||||
run: |
|
|
||||||
echo "::set-output name=ecr_uri::${{ secrets[format('{0}_SAP_MODEL_ECR_URI', github.ref_name)] }}"
|
|
||||||
|
|
||||||
- name: Setup Docker
|
|
||||||
uses: docker/setup-buildx-action@v1
|
|
||||||
|
|
||||||
- name: Login to ECR
|
|
||||||
run: |
|
|
||||||
aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
|
|
||||||
|
|
||||||
# Building and pushing Docker image with caching
|
|
||||||
- name: Build and push Docker image
|
|
||||||
uses: docker/build-push-action@v3
|
|
||||||
with:
|
|
||||||
context: ./model_data/simulation_system
|
|
||||||
file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda
|
|
||||||
push: true
|
|
||||||
tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }}
|
|
||||||
cache-from: type=gha
|
|
||||||
cache-to: type=gha,mode=max
|
|
||||||
platform: linux/amd64
|
|
||||||
provenance: false
|
|
||||||
|
|
||||||
- name: Deploy to AWS Lambda via Serverless
|
|
||||||
env:
|
|
||||||
RUNTIME_ENVIRONMENT: ${{ github.ref_name }}
|
|
||||||
MODEL_DIRECTORY_BUCKET: 'retrofit-model-directory-${{ github.ref_name }}'
|
|
||||||
PREDICTIONS_BUCKET: 'retrofit-sap-predictions-${{ github.ref_name }}'
|
|
||||||
DATA_BUCKET: 'retrofit-data-${{ github.ref_name }}'
|
|
||||||
DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }}
|
|
||||||
ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
|
|
||||||
GITHUB_SHA: ${{ github.sha }}
|
|
||||||
run: |
|
|
||||||
# Deploy to AWS Lambda via Serverless
|
|
||||||
sls deploy --config sapmodel.serverless.yml --stage ${{ github.ref_name }} --verbose
|
|
||||||
6
.gitignore
vendored
6
.gitignore
vendored
|
|
@ -239,7 +239,8 @@ fabric.properties
|
||||||
.idea/caches/build_file_checksums.ser
|
.idea/caches/build_file_checksums.ser
|
||||||
|
|
||||||
# Locally stored data
|
# Locally stored data
|
||||||
/model_data/local_data/*
|
local_data/*
|
||||||
|
/local_data/*
|
||||||
|
|
||||||
*.DS_Store
|
*.DS_Store
|
||||||
infrastructure/terraform/.terraform*
|
infrastructure/terraform/.terraform*
|
||||||
|
|
@ -261,3 +262,6 @@ model_data/simulation_system/predictions/
|
||||||
|
|
||||||
.idea/Model.iml
|
.idea/Model.iml
|
||||||
.idea/misc.iml
|
.idea/misc.iml
|
||||||
|
|
||||||
|
adhoc
|
||||||
|
adhoc/*
|
||||||
9
.idea/Model.iml
generated
9
.idea/Model.iml
generated
|
|
@ -7,7 +7,14 @@
|
||||||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||||
</content>
|
</content>
|
||||||
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
|
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
</component>
|
</component>
|
||||||
|
<component name="PyNamespacePackagesService">
|
||||||
|
<option name="namespacePackageFolders">
|
||||||
|
<list>
|
||||||
|
<option value="$MODULE_DIR$/local_data" />
|
||||||
|
</list>
|
||||||
|
</option>
|
||||||
|
</component>
|
||||||
</module>
|
</module>
|
||||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -1,6 +1,6 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||||
<option name="version" value="3" />
|
<option name="version" value="3" />
|
||||||
</component>
|
</component>
|
||||||
|
|
|
||||||
|
|
@ -43,7 +43,9 @@ class Definitions:
|
||||||
# contained within the first of these multiple entries is being provided. As there are no restrictions on the
|
# contained within the first of these multiple entries is being provided. As there are no restrictions on the
|
||||||
# value in this first field it means that sometimes the first field in a multiple entry description field may
|
# value in this first field it means that sometimes the first field in a multiple entry description field may
|
||||||
# contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
|
# contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
|
||||||
"NULL"
|
"NULL",
|
||||||
|
# We sometimes see fields populated with just an empty string.
|
||||||
|
""
|
||||||
}
|
}
|
||||||
|
|
||||||
DATA_ANOMALY_SUBSTRINGS = {
|
DATA_ANOMALY_SUBSTRINGS = {
|
||||||
|
|
@ -1,9 +1,22 @@
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from etl.epc.DataProcessor import DataProcessor
|
||||||
|
from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map
|
||||||
|
from utils.logger import setup_logger
|
||||||
|
from utils.s3 import read_dataframe_from_s3_parquet
|
||||||
from epc_api.client import EpcClient
|
from epc_api.client import EpcClient
|
||||||
from model_data.config import EPC_AUTH_TOKEN
|
from BaseUtility import Definitions
|
||||||
from model_data.BaseUtility import Definitions
|
|
||||||
from recommendations.rdsap_tables import england_wales_age_band_lookup
|
from recommendations.rdsap_tables import england_wales_age_band_lookup
|
||||||
|
from recommendations.recommendation_utils import estimate_floors, estimate_perimeter, get_wall_type, estimate_wall_area
|
||||||
|
|
||||||
|
ENVIRONMENT = os.environ.get('ENVIRONMENT', 'dev')
|
||||||
|
EPC_AUTH_TOKEN = os.environ.get('EPC_AUTH_TOKEN')
|
||||||
|
DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT == 'dev' else None)
|
||||||
|
|
||||||
|
logger = setup_logger()
|
||||||
|
|
||||||
|
|
||||||
class Property(Definitions):
|
class Property(Definitions):
|
||||||
|
|
@ -30,17 +43,27 @@ class Property(Definitions):
|
||||||
lighting = None
|
lighting = None
|
||||||
|
|
||||||
coordinates = None
|
coordinates = None
|
||||||
age_band = None
|
|
||||||
|
|
||||||
def __init__(self, id, postcode, address1, epc_client=None, data=None):
|
def __init__(self, id, postcode, address1, epc_client=None, data=None):
|
||||||
self.id = id
|
self.id = id
|
||||||
self.postcode = postcode
|
self.postcode = postcode
|
||||||
self.address1 = address1
|
self.address1 = address1
|
||||||
self.data = data
|
self.data = data
|
||||||
|
self.old_data = None
|
||||||
|
self.property_dimensions = None
|
||||||
|
|
||||||
|
self.uprn = None
|
||||||
self.full_sap_epc = None
|
self.full_sap_epc = None
|
||||||
self.in_conservation_area = None
|
self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
|
||||||
|
self.restricted_measures = False
|
||||||
self.year_built = None
|
self.year_built = None
|
||||||
self.number_of_rooms = None
|
self.number_of_rooms = None
|
||||||
|
self.age_band = None
|
||||||
|
self.construction_age_band = None
|
||||||
|
self.number_of_floors = None
|
||||||
|
self.perimeter = None
|
||||||
|
self.wall_type = None
|
||||||
|
self.floor_type = None
|
||||||
|
|
||||||
self.energy = None
|
self.energy = None
|
||||||
self.ventilation = None
|
self.ventilation = None
|
||||||
|
|
@ -83,9 +106,14 @@ class Property(Definitions):
|
||||||
]
|
]
|
||||||
if len(newest_response) > 1:
|
if len(newest_response) > 1:
|
||||||
raise Exception("More than one result found for this address - investigate me")
|
raise Exception("More than one result found for this address - investigate me")
|
||||||
|
|
||||||
|
# We'll keep old EPCs in case they contain information not present on the newest one
|
||||||
|
self.old_data = [epc for epc in response["rows"] if epc["lmk-key"] != newest_response[0]["lmk-key"]]
|
||||||
|
|
||||||
response["rows"] = newest_response
|
response["rows"] = newest_response
|
||||||
|
|
||||||
self.data = response["rows"][0]
|
self.data = response["rows"][0]
|
||||||
|
self.uprn = int(self.data["uprn"])
|
||||||
|
|
||||||
def set_coordinates(self, coordinates):
|
def set_coordinates(self, coordinates):
|
||||||
"""
|
"""
|
||||||
|
|
@ -127,7 +155,7 @@ class Property(Definitions):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
ventilation = self.data["mechanical-ventilation"]
|
ventilation = self.data["mechanical-ventilation"]
|
||||||
# perform some simple cleaning - when checking 300k properties, the only unique values were
|
# perform some simple cleaning - when checking 300k epc, the only unique values were
|
||||||
# {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'}
|
# {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'}
|
||||||
if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]:
|
if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]:
|
||||||
ventilation = None
|
ventilation = None
|
||||||
|
|
@ -145,7 +173,7 @@ class Property(Definitions):
|
||||||
- solar_pv
|
- solar_pv
|
||||||
This is based on the "photo-supply" field in the EPC data.
|
This is based on the "photo-supply" field in the EPC data.
|
||||||
|
|
||||||
When checking 100k properties, either the value was "" or a stringified number
|
When checking 100k epc, either the value was "" or a stringified number
|
||||||
"""
|
"""
|
||||||
|
|
||||||
solar_pv = self.data["photo-supply"]
|
solar_pv = self.data["photo-supply"]
|
||||||
|
|
@ -244,11 +272,10 @@ class Property(Definitions):
|
||||||
self.set_count_variables()
|
self.set_count_variables()
|
||||||
self.set_heat_loss_corridor()
|
self.set_heat_loss_corridor()
|
||||||
self.set_mains_gas()
|
self.set_mains_gas()
|
||||||
self.set_floor_height()
|
|
||||||
self.set_wall_area()
|
|
||||||
self.set_floor_area()
|
|
||||||
self.set_age_band()
|
self.set_age_band()
|
||||||
|
|
||||||
|
self.set_basic_property_dimensions()
|
||||||
|
|
||||||
for description, attribute in cleaned.items():
|
for description, attribute in cleaned.items():
|
||||||
|
|
||||||
if self.data[description] in self.DATA_ANOMALY_MATCHES:
|
if self.data[description] in self.DATA_ANOMALY_MATCHES:
|
||||||
|
|
@ -262,10 +289,19 @@ class Property(Definitions):
|
||||||
attributes = [
|
attributes = [
|
||||||
x for x in cleaned[description] if x["original_description"] == self.data[description]
|
x for x in cleaned[description] if x["original_description"] == self.data[description]
|
||||||
]
|
]
|
||||||
if len(attributes) != 1:
|
if len(attributes) > 1:
|
||||||
raise ValueError("Either No attributes or multiple found for %s" % description)
|
raise ValueError("Either No attributes or multiple found for %s" % description)
|
||||||
|
|
||||||
|
if len(attributes) == 0:
|
||||||
|
# We attempt to perform the clean on the fly
|
||||||
|
cleaner_cls = all_cleaner_map[description]
|
||||||
|
attributes = [cleaner_cls(self.data[description]).process()]
|
||||||
|
|
||||||
setattr(self, self.ATTRIBUTE_MAP[description], attributes[0])
|
setattr(self, self.ATTRIBUTE_MAP[description], attributes[0])
|
||||||
|
|
||||||
|
self.set_wall_type()
|
||||||
|
self.set_floor_type()
|
||||||
|
|
||||||
def set_age_band(self):
|
def set_age_band(self):
|
||||||
"""
|
"""
|
||||||
Sets a cleaned version of the age band of the property given the EPC data
|
Sets a cleaned version of the age band of the property given the EPC data
|
||||||
|
|
@ -275,14 +311,20 @@ class Property(Definitions):
|
||||||
if not self.data:
|
if not self.data:
|
||||||
raise ValueError("Property does not contain data")
|
raise ValueError("Property does not contain data")
|
||||||
|
|
||||||
self.age_band = england_wales_age_band_lookup[self.data["construction-age-band"]]
|
self.construction_age_band = DataProcessor.clean_construction_age_band(self.data["construction-age-band"])
|
||||||
|
self.age_band = england_wales_age_band_lookup.get(self.construction_age_band)
|
||||||
|
|
||||||
def set_is_in_conservation_area(self, in_conservation_area):
|
def set_spatial(self, spatial: pd.DataFrame):
|
||||||
"""
|
"""
|
||||||
Sets whether the property is in a conservation area given the output of the ConservationAreaClient
|
Sets whether the property is in a conservation area given the output of the ConservationAreaClient
|
||||||
:param in_conservation_area: string value, indicating whether the property is in a conservation area
|
:param spatial: Dataframe, containing the spatial data for the property
|
||||||
"""
|
"""
|
||||||
self.in_conservation_area = in_conservation_area
|
self.in_conservation_area = spatial["conservation_status"].values[0]
|
||||||
|
self.is_listed = spatial["is_listed_building"].values[0]
|
||||||
|
self.is_heritage = spatial["is_heritage_building"].values[0]
|
||||||
|
|
||||||
|
if self.in_conservation_area is True | self.is_listed is True | self.is_heritage is True:
|
||||||
|
self.restricted_measures = True
|
||||||
|
|
||||||
def set_year_built(self):
|
def set_year_built(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -349,17 +391,6 @@ class Property(Definitions):
|
||||||
else:
|
else:
|
||||||
self.mains_gas = map[self.data["mains-gas-flag"]]
|
self.mains_gas = map[self.data["mains-gas-flag"]]
|
||||||
|
|
||||||
def set_floor_height(self):
|
|
||||||
"""
|
|
||||||
Sets the floor height of the property
|
|
||||||
:return:
|
|
||||||
"""
|
|
||||||
|
|
||||||
if self.data["floor-height"] == "" or self.data["floor-height"] in self.DATA_ANOMALY_MATCHES:
|
|
||||||
self.floor_height = None
|
|
||||||
else:
|
|
||||||
self.floor_height = float(self.data["floor-height"])
|
|
||||||
|
|
||||||
def _clean_upload_data(self, to_update):
|
def _clean_upload_data(self, to_update):
|
||||||
for k, v in to_update.items():
|
for k, v in to_update.items():
|
||||||
if v in self.DATA_ANOMALY_MATCHES:
|
if v in self.DATA_ANOMALY_MATCHES:
|
||||||
|
|
@ -443,21 +474,210 @@ class Property(Definitions):
|
||||||
|
|
||||||
return property_details_epc
|
return property_details_epc
|
||||||
|
|
||||||
def set_wall_area(self):
|
def get_spatial_data(self, uprn_filenames):
|
||||||
"""
|
|
||||||
This method is placeholder
|
|
||||||
It implements our floor area model to produce an estimate of the property's insulatable wall area
|
|
||||||
"""
|
|
||||||
|
|
||||||
import random
|
|
||||||
self.insulation_wall_area = random.uniform(60, 100)
|
|
||||||
|
|
||||||
def set_floor_area(self):
|
|
||||||
"""
|
|
||||||
Sets the floor area based on the EPC data
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# We don't know the number of floors at the moment so we're going to assume 1
|
Given a property's UPRN, this method will pull the associated spatial data from s3
|
||||||
# however this is something we'll need to use Verisk data for
|
:return:
|
||||||
|
"""
|
||||||
|
|
||||||
|
if self.uprn is None:
|
||||||
|
raise ValueError("URPN is not set, run search_address_epc")
|
||||||
|
|
||||||
|
# We get the file name for the uprn
|
||||||
|
filtered_df = uprn_filenames[(uprn_filenames['lower'] <= self.uprn) & (uprn_filenames['upper'] >= self.uprn)]
|
||||||
|
if filtered_df.empty:
|
||||||
|
logger.warning("Could not find file containing UPRNS")
|
||||||
|
return None
|
||||||
|
|
||||||
|
filename = filtered_df.iloc[0]['filenames']
|
||||||
|
|
||||||
|
spatial_data = read_dataframe_from_s3_parquet(
|
||||||
|
bucket_name=DATA_BUCKET, file_key=f"spatial/{filename}"
|
||||||
|
)
|
||||||
|
|
||||||
|
spatial = spatial_data[spatial_data["UPRN"] == self.uprn]
|
||||||
|
|
||||||
|
# Pull out spatial features
|
||||||
|
self.set_spatial(spatial)
|
||||||
|
|
||||||
|
def _filter_property_dimensions(self, property_dimensions):
|
||||||
|
"""
|
||||||
|
Will filter the property dimensions dataframe to only include the relevant rows for the property
|
||||||
|
:param property_dimensions:
|
||||||
|
:return: filtered property dimensions dataframe
|
||||||
|
"""
|
||||||
|
|
||||||
|
result = property_dimensions[(property_dimensions["PROPERTY_TYPE"] == self.data["property-type"])]
|
||||||
|
|
||||||
|
if self.construction_age_band is not None and self.construction_age_band not in self.DATA_ANOMALY_MATCHES:
|
||||||
|
result = result[(result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band)]
|
||||||
|
|
||||||
|
if self.data["built-form"] not in self.DATA_ANOMALY_MATCHES and self.data["built-form"] in result["BUILT_FORM"]:
|
||||||
|
result = result[(result["BUILT_FORM"] == self.data["built-form"])]
|
||||||
|
|
||||||
|
return result[["NUMBER_HABITABLE_ROOMS", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]].mean()
|
||||||
|
|
||||||
|
def set_basic_property_dimensions(self):
|
||||||
|
"""
|
||||||
|
This method sets the number of floors of the property, using a simple approach based on an estimate for
|
||||||
|
average room size, number of rooms and total floor area
|
||||||
|
|
||||||
|
It sets the perimeter of the property, using a simple approach based on an estimate for average room size,
|
||||||
|
number of rooms and total floor area
|
||||||
|
|
||||||
|
Also sets floor area, number of rooms, using backup cleaned values if this data is not present, based on
|
||||||
|
medians across the EPC data
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
|
||||||
self.floor_area = float(self.data["total-floor-area"])
|
self.floor_area = float(self.data["total-floor-area"])
|
||||||
|
|
||||||
|
if not self.data["number-habitable-rooms"] or (
|
||||||
|
self.data["floor-height"] == "" or self.data["floor-height"] in self.DATA_ANOMALY_MATCHES
|
||||||
|
):
|
||||||
|
if self.property_dimensions is None:
|
||||||
|
property_dimensions = read_dataframe_from_s3_parquet(
|
||||||
|
bucket_name=DATA_BUCKET, file_key=f"property_dimensions/{self.data['local-authority']}.parquet"
|
||||||
|
)
|
||||||
|
self.property_dimensions = self._filter_property_dimensions(property_dimensions)
|
||||||
|
|
||||||
|
if not self.data["number-habitable-rooms"]:
|
||||||
|
self.number_of_rooms = float(self.property_dimensions["NUMBER_HABITABLE_ROOMS"].round())
|
||||||
|
else:
|
||||||
|
self.number_of_rooms = float(self.data["number-habitable-rooms"])
|
||||||
|
|
||||||
|
if self.data["property-type"] == "House":
|
||||||
|
self.number_of_floors = estimate_floors(self.floor_area, self.number_of_rooms)
|
||||||
|
elif self.data["property-type"] == "Flat":
|
||||||
|
self.number_of_floors = 1
|
||||||
|
else:
|
||||||
|
raise NotImplementedError("Implement me")
|
||||||
|
|
||||||
|
if self.data["floor-height"] == "" or self.data["floor-height"] in self.DATA_ANOMALY_MATCHES:
|
||||||
|
self.floor_height = float(self.property_dimensions["FLOOR_HEIGHT"].round(2))
|
||||||
|
else:
|
||||||
|
self.floor_height = float(self.data["floor-height"])
|
||||||
|
|
||||||
|
self.perimeter = estimate_perimeter(
|
||||||
|
self.floor_area / self.number_of_floors, self.number_of_rooms / self.number_of_floors
|
||||||
|
)
|
||||||
|
|
||||||
|
self.insulation_wall_area = estimate_wall_area(
|
||||||
|
num_floors=self.number_of_floors, floor_height=self.floor_height, perimeter=self.perimeter
|
||||||
|
)
|
||||||
|
|
||||||
|
def set_wall_type(self):
|
||||||
|
"""
|
||||||
|
This method sets the wall type of the property, using a simple approach based on the wall description
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
self.wall_type = get_wall_type(**self.walls)
|
||||||
|
|
||||||
|
def set_floor_type(self):
|
||||||
|
"""
|
||||||
|
This method sets the floor type of the property, which is used for calculating u-values
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
self.floor_type = "suspended" if self.floor["is_suspended"] else "solid"
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _extract_component(component_data, component_rename_cols, component_drop_cols, rename_prefix=None):
|
||||||
|
for k in component_rename_cols:
|
||||||
|
component_data[f"{rename_prefix}_{k}"] = component_data[k]
|
||||||
|
|
||||||
|
component_data = {
|
||||||
|
k: v for k, v in component_data.items() if k not in component_drop_cols + component_rename_cols
|
||||||
|
}
|
||||||
|
|
||||||
|
return component_data
|
||||||
|
|
||||||
|
def get_model_data(self):
|
||||||
|
"""
|
||||||
|
This method extracts cleaned data from the property object, which is used in our machine learning models
|
||||||
|
|
||||||
|
This will use many of the cleaned properties, extracted from the epc data, or methods in DataProcessor.
|
||||||
|
|
||||||
|
For future iterations of this, we probably want to implement a singular method in DataProcessor, which can
|
||||||
|
be used in the etl code and in here
|
||||||
|
|
||||||
|
:return: dictionary of model data to be scored in the model
|
||||||
|
"""
|
||||||
|
|
||||||
|
drop_cols = ["original_description", "clean_description"]
|
||||||
|
insulation_drop_cols = ["thermal_transmittance_unit", "is_assumed", "is_valid"]
|
||||||
|
insulation_rename_cols = ["thermal_transmittance", "insulation_thickness"]
|
||||||
|
|
||||||
|
walls = self._extract_component(self.walls, insulation_rename_cols, insulation_drop_cols + drop_cols, "walls")
|
||||||
|
roof = self._extract_component(self.roof, insulation_rename_cols, insulation_drop_cols + drop_cols, "roof")
|
||||||
|
floor = self._extract_component(self.floor, insulation_rename_cols, insulation_drop_cols + drop_cols, "floor")
|
||||||
|
|
||||||
|
windows = self._extract_component(self.windows, [], drop_cols + ["no_data"])
|
||||||
|
fuel = self._extract_component(self.main_fuel, ["tariff_type"], drop_cols + ["tariff_type"], "main-fuel")
|
||||||
|
main_heating = self._extract_component(self.main_heating, [], drop_cols + ["has_assumed"])
|
||||||
|
main_heating_controls = self._extract_component(self.main_heating_controls, [], drop_cols)
|
||||||
|
hotwater = self._extract_component(self.hotwater, ["tariff_type"], drop_cols + ['assumed'], "hotwater")
|
||||||
|
|
||||||
|
# We'll need to clean second heating
|
||||||
|
second_heating = self.data["secondheat-description"]
|
||||||
|
|
||||||
|
epc_raw_columns = [
|
||||||
|
'TRANSACTION_TYPE',
|
||||||
|
'ENERGY_TARIFF',
|
||||||
|
'PROPERTY_TYPE',
|
||||||
|
'UPRN',
|
||||||
|
'NUMBER_OPEN_FIREPLACES',
|
||||||
|
'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||||
|
'MULTI_GLAZE_PROPORTION',
|
||||||
|
'MECHANICAL_VENTILATION',
|
||||||
|
'PHOTO_SUPPLY',
|
||||||
|
'LOW_ENERGY_LIGHTING',
|
||||||
|
'SOLAR_WATER_HEATING_FLAG',
|
||||||
|
'GLAZED_TYPE',
|
||||||
|
'CONSTITUENCY',
|
||||||
|
'NUMBER_HEATED_ROOMS',
|
||||||
|
'EXTENSION_COUNT',
|
||||||
|
]
|
||||||
|
epc_raw_data = {
|
||||||
|
k: self.data[k.lower().replace("_", "-")] for k in epc_raw_columns
|
||||||
|
}
|
||||||
|
|
||||||
|
built_form_cleaning_map = {
|
||||||
|
"Flat": "Mid-Terrace",
|
||||||
|
"House": "Semi-Detached",
|
||||||
|
"Bungalow": "Detached",
|
||||||
|
"Maisonette": "Mid-Terrace"
|
||||||
|
}
|
||||||
|
|
||||||
|
built_form = self.data["built-form"]
|
||||||
|
if built_form in self.DATA_ANOMALY_MATCHES:
|
||||||
|
# TODO: If built form isn't captured, we use the most common value for that property type - we shall
|
||||||
|
# improve this methodology
|
||||||
|
built_form = built_form_cleaning_map.get(self.data["property-type"])
|
||||||
|
if not built_form:
|
||||||
|
raise NotImplementedError("Not handled this property type when cleaning built form")
|
||||||
|
|
||||||
|
property_data = {
|
||||||
|
**walls,
|
||||||
|
**roof,
|
||||||
|
**floor,
|
||||||
|
**fuel,
|
||||||
|
**main_heating,
|
||||||
|
**main_heating_controls,
|
||||||
|
**hotwater,
|
||||||
|
**windows,
|
||||||
|
"SECONDHEAT_DESCRIPTION": second_heating,
|
||||||
|
"DAYS_TO": DataProcessor.calculate_days_to(self.data["lodgement-date"]),
|
||||||
|
"SAP": float(self.data["current-energy-efficiency"]),
|
||||||
|
"CARBON": float(self.data["co2-emissions-current"]),
|
||||||
|
"HEAT_DEMAND": float(self.data["energy-consumption-current"]),
|
||||||
|
"estimated_perimeter": self.perimeter,
|
||||||
|
"CONSTRUCTION_AGE_BAND": self.construction_age_band,
|
||||||
|
"FLOOR_HEIGHT": self.floor_height,
|
||||||
|
"NUMBER_HABITABLE_ROOMS": self.number_of_rooms,
|
||||||
|
"TOTAL_FLOOR_AREA": self.floor_area,
|
||||||
|
**epc_raw_data,
|
||||||
|
"BUILT_FORM": built_form,
|
||||||
|
}
|
||||||
|
|
||||||
|
return property_data
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,17 @@
|
||||||
from backend.app.db.models.materials import Material
|
from backend.app.db.models.materials import Material
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=128)
|
||||||
def get_materials(session):
|
def get_materials(session):
|
||||||
"""
|
"""
|
||||||
This function will retrieve all materials from the database.
|
This function will retrieve all materials from the database.
|
||||||
:return: A list of Material objects if successful, an empty list otherwise.
|
:return: A list of Material objects if successful, an empty list otherwise.
|
||||||
|
|
||||||
|
|
||||||
|
TODO: It might not be the best choice to store the materials data in a database table since this
|
||||||
|
table probably won't be very large and won't be updated that often. It might be better to
|
||||||
|
store this data in s3 and load it into memory when the app starts up. We will test this
|
||||||
"""
|
"""
|
||||||
|
|
||||||
materials = session.query(Material).filter(Material.is_active).all()
|
materials = session.query(Material).filter(Material.is_active).all()
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,7 @@ class MaterialType(enum.Enum):
|
||||||
solid_floor_insulation = "solid_floor_insulation"
|
solid_floor_insulation = "solid_floor_insulation"
|
||||||
external_wall_insulation = "external_wall_insulation"
|
external_wall_insulation = "external_wall_insulation"
|
||||||
internal_wall_insulation = "internal_wall_insulation"
|
internal_wall_insulation = "internal_wall_insulation"
|
||||||
|
cavity_wall_insulation = "cavity_wall_insulation"
|
||||||
|
|
||||||
|
|
||||||
class DepthUnit(enum.Enum):
|
class DepthUnit(enum.Enum):
|
||||||
|
|
|
||||||
|
|
@ -1,50 +1,41 @@
|
||||||
from collections import defaultdict
|
|
||||||
from fastapi import APIRouter, Depends
|
|
||||||
from backend.app.db.models.portfolio import rating_lookup
|
|
||||||
from backend.app.dependencies import validate_token
|
|
||||||
from backend.app.plan.schemas import PlanTriggerRequest
|
|
||||||
from backend.app.utils import read_csv_from_s3
|
|
||||||
from backend.app.config import get_settings
|
|
||||||
from backend.Property import Property
|
|
||||||
from epc_api.client import EpcClient
|
|
||||||
from utils.logger import setup_logger
|
|
||||||
from utils.s3 import read_from_s3
|
|
||||||
from recommendations.FloorRecommendations import FloorRecommendations
|
|
||||||
from recommendations.WallRecommendations import WallRecommendations
|
|
||||||
from recommendations.config import UPGRADES_MAP
|
|
||||||
from utils.uvalue_estimates import classify_decile_newvalues
|
|
||||||
from backend.app.db.utils import row2dict
|
|
||||||
from starlette.responses import Response
|
|
||||||
from sqlalchemy.orm import sessionmaker
|
|
||||||
from sqlalchemy.exc import IntegrityError, OperationalError
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import msgpack
|
from epc_api.client import EpcClient
|
||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from sqlalchemy.exc import IntegrityError, OperationalError
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
from starlette.responses import Response
|
||||||
|
|
||||||
# model apis
|
from backend.app.config import get_settings
|
||||||
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
|
from backend.app.db.connection import db_engine
|
||||||
|
|
||||||
# database interaction functions
|
|
||||||
from backend.app.db.functions.property_functions import (
|
|
||||||
create_property, create_property_targets, update_property_data, create_property_details_epc
|
|
||||||
)
|
|
||||||
from backend.app.db.functions.materials_functions import get_materials
|
from backend.app.db.functions.materials_functions import get_materials
|
||||||
|
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
|
||||||
|
from backend.app.db.functions.property_functions import (
|
||||||
|
create_property, create_property_details_epc, create_property_targets, update_property_data
|
||||||
|
)
|
||||||
from backend.app.db.functions.recommendations_functions import (
|
from backend.app.db.functions.recommendations_functions import (
|
||||||
create_plan, create_plan_recommendations, upload_recommendations
|
create_plan, create_plan_recommendations, upload_recommendations
|
||||||
)
|
)
|
||||||
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
|
from backend.app.db.models.portfolio import rating_lookup
|
||||||
from backend.app.db.connection import db_engine
|
from backend.app.dependencies import validate_token
|
||||||
|
from backend.app.plan.schemas import PlanTriggerRequest
|
||||||
|
from backend.app.plan.utils import (
|
||||||
|
create_recommendation_scoring_data, filter_materials, get_cleaned, insert_temp_recommendation_id
|
||||||
|
)
|
||||||
|
from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_parquet_from_s3
|
||||||
|
|
||||||
from model_data.optimiser.GainOptimiser import GainOptimiser
|
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
|
||||||
from model_data.optimiser.CostOptimiser import CostOptimiser
|
from backend.Property import Property
|
||||||
from backend.app.utils import epc_to_sap_lower_bound, read_parquet_from_s3
|
from etl.epc.DataProcessor import DataProcessor
|
||||||
from model_data.optimiser.optimiser_functions import prepare_input_measures
|
from etl.epc.settings import COLUMNS_TO_MERGE_ON
|
||||||
from model_data.simulation_system.core.DataProcessor import DataProcessor
|
from recommendations.FloorRecommendations import FloorRecommendations
|
||||||
from model_data.simulation_system.core.Settings import COLUMNS_TO_MERGE_ON
|
from recommendations.optimiser.CostOptimiser import CostOptimiser
|
||||||
|
from recommendations.optimiser.GainOptimiser import GainOptimiser
|
||||||
# TODO: This is placeholder until data is stored in DB
|
from recommendations.optimiser.optimiser_functions import prepare_input_measures
|
||||||
from backend.app.plan.uvalue_estimates_walls import uvalue_estimates_walls
|
from recommendations.WallRecommendations import WallRecommendations
|
||||||
from backend.app.plan.uvalue_estimates_floors import uvalue_estimates_floors
|
from utils.logger import setup_logger
|
||||||
|
from utils.s3 import read_dataframe_from_s3_parquet
|
||||||
|
|
||||||
logger = setup_logger()
|
logger = setup_logger()
|
||||||
|
|
||||||
|
|
@ -55,147 +46,25 @@ router = APIRouter(
|
||||||
responses={404: {"description": "Not found"}}
|
responses={404: {"description": "Not found"}}
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO: Load this data from db
|
|
||||||
open_uprn_data = [
|
|
||||||
{'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
|
|
||||||
'LONGITUDE': -0.0540506},
|
|
||||||
{'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
|
|
||||||
'LONGITUDE': -0.0498772},
|
|
||||||
{'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
|
|
||||||
'LONGITUDE': -0.226392},
|
|
||||||
{'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
|
|
||||||
'LONGITUDE': -0.0792445},
|
|
||||||
{'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
|
|
||||||
'LONGITUDE': -0.0792445},
|
|
||||||
{'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
|
|
||||||
'LONGITUDE': -0.0468833},
|
|
||||||
{'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
|
|
||||||
'LONGITUDE': -0.1362513},
|
|
||||||
{'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
|
|
||||||
'LONGITUDE': -0.0823165}
|
|
||||||
]
|
|
||||||
|
|
||||||
in_conservation_area_data = [
|
|
||||||
{'uprn': 6032920, 'is_in_conservation_area': 'not_in_conservation_area'},
|
|
||||||
{'uprn': 6038625, 'is_in_conservation_area': 'not_in_conservation_area'},
|
|
||||||
{'uprn': 34153991, 'is_in_conservation_area': 'unknown'},
|
|
||||||
{'uprn': 10008299676, 'is_in_conservation_area': 'in_conservation_area'},
|
|
||||||
{'uprn': 10008299677, 'is_in_conservation_area': 'in_conservation_area'},
|
|
||||||
{'uprn': 100021039066, 'is_in_conservation_area': 'not_in_conservation_area'},
|
|
||||||
{'uprn': 100021226060, 'is_in_conservation_area': 'in_conservation_area'},
|
|
||||||
{'uprn': 200003489276, 'is_in_conservation_area': 'in_conservation_area'}
|
|
||||||
]
|
|
||||||
|
|
||||||
# TODO: db
|
|
||||||
floors_decile_data = {
|
|
||||||
'decile_labels': ['Decile 1', 'Decile 2', 'Decile 3', 'Decile 4', 'Decile 5', 'Decile 6', 'Decile 7', 'Decile 8',
|
|
||||||
'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 50., 56., 69., 77.6, 87., 98., 112.,
|
|
||||||
127., 150., 2279.]}
|
|
||||||
|
|
||||||
walls_decile_data = {
|
|
||||||
'decile_labels': ['Decile 1', 'Decile 2', 'Decile 3', 'Decile 4', 'Decile 5', 'Decile 6', 'Decile 7', 'Decile 8',
|
|
||||||
'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 49., 51., 55., 64., 71., 76., 83., 96.,
|
|
||||||
120., 2279.]}
|
|
||||||
|
|
||||||
|
|
||||||
def filter_materials(materials):
    """
    Group material rows by the value of their "type" field.

    Every row is converted with row2dict before grouping, and rows keep their
    input order inside each group.

    :param materials: iterable of material rows that row2dict can convert
    :return: plain dict mapping each material type that occurs in the input to the
        list of converted materials of that type. Types with no materials are absent,
        so looking one up raises KeyError.
    """
    materials_by_type = defaultdict(list)

    for row in materials:
        material = row2dict(row)
        materials_by_type[material["type"]].append(material)

    # Hand back a plain dict so that lookups by callers do not silently create empty groups
    return dict(materials_by_type)
|
|
||||||
|
|
||||||
|
|
||||||
def insert_temp_recommendation_id(property_recommendations):
    """
    Creates a temporary recommendation id for every recommendation, which is needed
    for filtering recommendations after the optimiser has been run.

    Ids start at 0 and keep increasing across all groups, so each id is unique within
    the nested list passed in. The recommendation dicts are updated in place.

    :param property_recommendations: nested list of recommendations, grouped by type
    :return: the same nested list, where each recommendation has a "recommendation_id"
        integer inserted
    """
    # Flatten lazily so the counter runs across the groups rather than restarting per group
    all_recommendations = (rec for recs in property_recommendations for rec in recs)

    for idx, rec in enumerate(all_recommendations):
        rec["recommendation_id"] = idx

    return property_recommendations
|
|
||||||
|
|
||||||
|
|
||||||
def get_cleaned():
    """
    Retrieve the cleaned dataset from s3, which has the cleaned descriptions for the
    epc dataset.

    The object is read from the "retrofit-data-<ENVIRONMENT>" bucket, where ENVIRONMENT
    comes from the application settings. Although the key ends in ".bson", the payload
    is decoded as MessagePack.

    :return: the decoded MessagePack payload
    """
    bucket_name = "retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)

    packed = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name=bucket_name,
    )

    # raw=False makes msgpack decode strings to str rather than leaving them as bytes
    return msgpack.unpackb(packed, raw=False)
|
|
||||||
|
|
||||||
|
|
||||||
def create_recommendation_scoring_data(
    property: Property,
    recommendation: dict,
    starting_epc_data: pd.DataFrame,
    ending_epc_data: pd.DataFrame,
    fixed_data: pd.DataFrame,
):
    """
    This wrapper function prepares one record to be passed to the sap model api.

    The record combines identifying fields from the property with the first row of
    each of the three dataframes. Where column names clash, later sources win, in the
    order starting, ending, fixed.

    :param property: property the recommendation belongs to; its data, id, walls and
        floor attributes are read
    :param recommendation: recommendation dict with "recommendation_id" and "type" keys
    :param starting_epc_data: dataframe whose first row holds the starting features
    :param ending_epc_data: dataframe whose first row holds the ending features
    :param fixed_data: dataframe whose first row holds the fixed features
    :return: dict with the scoring record for this property and recommendation
    :raises NotImplementedError: if the recommendation type is neither
        "wall_insulation" nor "floor_insulation"
    """

    scoring_dict = {
        "UPRN": property.data["uprn"],
        # Property id and temporary recommendation id, joined with "+"
        "id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
        "LOCAL_AUTHORITY": property.data["local-authority"],
        **starting_epc_data.to_dict("records")[0],
        **ending_epc_data.to_dict("records")[0],
        **fixed_data.to_dict("records")[0],
    }

    # We update the description to indicate it's insulated
    recommendation_type = recommendation["type"]
    if recommendation_type == "wall_insulation":
        scoring_dict["WALLS_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.walls["clean_description"]]
    elif recommendation_type == "floor_insulation":
        scoring_dict["FLOOR_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.floor["clean_description"]]
    else:
        raise NotImplementedError(
            f"No scoring update implemented for recommendation type {recommendation_type!r}"
        )

    return scoring_dict
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/trigger")
|
@router.post("/trigger")
|
||||||
async def trigger_plan(body: PlanTriggerRequest):
|
async def trigger_plan(body: PlanTriggerRequest):
|
||||||
logger.info("Connecting to db")
|
logger.info("Connecting to db")
|
||||||
session = sessionmaker(bind=db_engine)()
|
session = sessionmaker(bind=db_engine)()
|
||||||
created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
created_at = datetime.now().isoformat()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
session.begin()
|
session.begin()
|
||||||
logger.info("Getting the inputs")
|
logger.info("Getting the inputs")
|
||||||
# Read in the trigger file from s3
|
|
||||||
bucket_name = get_settings().PLAN_TRIGGER_BUCKET
|
|
||||||
epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
|
epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
|
||||||
|
plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path)
|
||||||
|
uprn_filenames = read_dataframe_from_s3_parquet(
|
||||||
|
bucket_name=get_settings().DATA_BUCKET, file_key="spatial/filename_meta.parquet"
|
||||||
|
)
|
||||||
|
cleaning_data = read_parquet_from_s3(
|
||||||
|
bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
|
||||||
|
)
|
||||||
|
|
||||||
plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
|
|
||||||
input_properties = []
|
input_properties = []
|
||||||
for config in plan_input:
|
for config in plan_input:
|
||||||
# We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
|
# We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
|
||||||
|
|
@ -228,32 +97,21 @@ async def trigger_plan(body: PlanTriggerRequest):
|
||||||
if not input_properties:
|
if not input_properties:
|
||||||
return Response(status_code=204)
|
return Response(status_code=204)
|
||||||
|
|
||||||
logger.info("Getting EPC, coordinates and conservation area data")
|
logger.info("Getting EPC, and spatial data")
|
||||||
for p in input_properties:
|
for p in input_properties:
|
||||||
p.search_address_epc()
|
p.search_address_epc()
|
||||||
p.set_year_built()
|
p.set_year_built()
|
||||||
|
p.get_spatial_data(uprn_filenames)
|
||||||
coordinate_data = [x for x in open_uprn_data if x['UPRN'] == int(p.data['uprn'])][0]
|
|
||||||
p.set_coordinates(coordinate_data)
|
|
||||||
|
|
||||||
in_conservation_area = [x for x in in_conservation_area_data if x['uprn'] == int(p.data['uprn'])][0].get(
|
|
||||||
"is_in_conservation_area"
|
|
||||||
)
|
|
||||||
p.set_is_in_conservation_area(in_conservation_area)
|
|
||||||
|
|
||||||
# The materials data could be cached or local so we don't need to make
|
# The materials data could be cached or local so we don't need to make
|
||||||
# consistent requests to the backend for
|
# consistent requests to the backend for
|
||||||
# the same data
|
# the same data
|
||||||
# TODO: It might not be the best choice to store the materials data in a database table since this
|
|
||||||
# table probably won't be very large and won't be updated that often. It might be better to
|
|
||||||
# store this data in s3 load it into memory when the app starts up. We will test this
|
|
||||||
|
|
||||||
logger.info("Reading in materials and cleaned datasets")
|
logger.info("Reading in materials and cleaned datasets")
|
||||||
materials = get_materials(session)
|
materials = get_materials(session)
|
||||||
materials_by_type = filter_materials(materials)
|
materials_by_type = filter_materials(materials)
|
||||||
cleaned = get_cleaned()
|
cleaned = get_cleaned()
|
||||||
|
|
||||||
logger.info("Getting components and properties recommendations")
|
logger.info("Getting components and epc recommendations")
|
||||||
|
|
||||||
# TODO: Move this to a class. We probably want a Recommender class which takes the optimisers
|
# TODO: Move this to a class. We probably want a Recommender class which takes the optimisers
|
||||||
# in as a dependency and then the optimisers can take the input measures in as part of the setup() method
|
# in as a dependency and then the optimisers can take the input measures in as part of the setup() method
|
||||||
|
|
@ -263,34 +121,13 @@ async def trigger_plan(body: PlanTriggerRequest):
|
||||||
for p in input_properties:
|
for p in input_properties:
|
||||||
property_recommendations = []
|
property_recommendations = []
|
||||||
|
|
||||||
# For each property, classify the total floor area into a decile
|
|
||||||
total_floor_area_group_decile = classify_decile_newvalues(
|
|
||||||
decile_boundaries=floors_decile_data["decile_boundaries"],
|
|
||||||
decile_labels=floors_decile_data["decile_labels"],
|
|
||||||
new_values=[float(p.data["total-floor-area"])],
|
|
||||||
)[0]
|
|
||||||
|
|
||||||
# Property recommendations
|
# Property recommendations
|
||||||
p.get_components(cleaned)
|
p.get_components(cleaned)
|
||||||
|
|
||||||
# This is placeholder, until the full dataset is loaded into the database and we just make a read to the
|
|
||||||
# database
|
|
||||||
floors_u_value_estimate = [
|
|
||||||
x for x in uvalue_estimates_floors
|
|
||||||
if (x['local-authority'] == p.data["local-authority"]) &
|
|
||||||
(x['property-type'] == p.data["property-type"]) &
|
|
||||||
(x['built-form'] == p.data["built-form"]) &
|
|
||||||
(x['floor-energy-eff'] == p.data["floor-energy-eff"] if p.data[
|
|
||||||
"floor-energy-eff"] != 'N/A' else True) &
|
|
||||||
(x['floor-env-eff'] == p.data["floor-env-eff"] if p.data["floor-env-eff"] != 'N/A' else True)
|
|
||||||
]
|
|
||||||
|
|
||||||
# Floor recommendations
|
# Floor recommendations
|
||||||
floor_recommender = FloorRecommendations(
|
floor_recommender = FloorRecommendations(
|
||||||
property_instance=p,
|
property_instance=p,
|
||||||
uvalue_estimates=floors_u_value_estimate,
|
materials=materials_by_type["floor"],
|
||||||
total_floor_area_group_decile=total_floor_area_group_decile,
|
|
||||||
materials=materials_by_type["suspended_floor_insulation"] + materials_by_type["solid_floor_insulation"],
|
|
||||||
)
|
)
|
||||||
floor_recommender.recommend()
|
floor_recommender.recommend()
|
||||||
|
|
||||||
|
|
@ -298,30 +135,10 @@ async def trigger_plan(body: PlanTriggerRequest):
|
||||||
property_recommendations.append(floor_recommender.recommendations)
|
property_recommendations.append(floor_recommender.recommendations)
|
||||||
|
|
||||||
# Wall recommendations
|
# Wall recommendations
|
||||||
# We would make this u-value query directly to the database
|
|
||||||
total_floor_area_group_decile = classify_decile_newvalues(
|
|
||||||
decile_boundaries=walls_decile_data["decile_boundaries"],
|
|
||||||
decile_labels=walls_decile_data["decile_labels"],
|
|
||||||
new_values=[float(p.data["total-floor-area"])],
|
|
||||||
)[0]
|
|
||||||
|
|
||||||
# This is placeholder, until the full dataset is loaded into the database and we just make a read to the
|
|
||||||
# database
|
|
||||||
walls_u_value_estimate = [
|
|
||||||
x for x in uvalue_estimates_walls
|
|
||||||
if (x['local-authority'] == p.data["local-authority"]) &
|
|
||||||
(x['property-type'] == p.data["property-type"]) &
|
|
||||||
(x['built-form'] == p.data["built-form"]) &
|
|
||||||
(x['walls-energy-eff'] == p.data["walls-energy-eff"] if p.data[
|
|
||||||
"walls-energy-eff"] != 'N/A' else True) &
|
|
||||||
(x['walls-env-eff'] == p.data["walls-env-eff"] if p.data["walls-env-eff"] != 'N/A' else True)
|
|
||||||
]
|
|
||||||
|
|
||||||
wall_recomender = WallRecommendations(
|
wall_recomender = WallRecommendations(
|
||||||
property_instance=p,
|
property_instance=p,
|
||||||
uvalue_estimates=walls_u_value_estimate,
|
materials=materials_by_type["walls"]
|
||||||
total_floor_area_group_decile=total_floor_area_group_decile,
|
|
||||||
materials=materials_by_type["external_wall_insulation"] + materials_by_type["internal_wall_insulation"]
|
|
||||||
)
|
)
|
||||||
wall_recomender.recommend()
|
wall_recomender.recommend()
|
||||||
|
|
||||||
|
|
@ -337,12 +154,8 @@ async def trigger_plan(body: PlanTriggerRequest):
|
||||||
recommendations[p.id] = property_recommendations
|
recommendations[p.id] = property_recommendations
|
||||||
|
|
||||||
# Finally, we'll prepare data for predicting the impact on SAP
|
# Finally, we'll prepare data for predicting the impact on SAP
|
||||||
# TODO: We should use the cleaned data from get_components in the data rather than the raw
|
|
||||||
# values. We should create a method in Property which takes the EPC data and inserts the cleaned
|
|
||||||
# data
|
|
||||||
|
|
||||||
data_processor = DataProcessor(None, newdata=True)
|
data_processor = DataProcessor(None, newdata=True)
|
||||||
data_processor.insert_data(pd.DataFrame([p.data.copy()]))
|
data_processor.insert_data(pd.DataFrame([p.get_model_data()]))
|
||||||
data_processor.pre_process()
|
data_processor.pre_process()
|
||||||
|
|
||||||
starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
|
starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
|
||||||
|
|
@ -350,10 +163,10 @@ async def trigger_plan(body: PlanTriggerRequest):
|
||||||
fixed_data = data_processor.get_fixed_features()
|
fixed_data = data_processor.get_fixed_features()
|
||||||
|
|
||||||
# We update the ending record with the recommended updates and we set lodgement date to today
|
# We update the ending record with the recommended updates and we set lodgement date to today
|
||||||
ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
|
ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(created_at)
|
||||||
|
|
||||||
for recommendations_by_type in property_recommendations:
|
for recommendations_by_type in property_recommendations:
|
||||||
for rec in recommendations_by_type:
|
for i, rec in enumerate(recommendations_by_type):
|
||||||
scoring_dict = create_recommendation_scoring_data(
|
scoring_dict = create_recommendation_scoring_data(
|
||||||
property=p,
|
property=p,
|
||||||
recommendation=rec,
|
recommendation=rec,
|
||||||
|
|
@ -370,15 +183,6 @@ async def trigger_plan(body: PlanTriggerRequest):
|
||||||
logger.info("Preparing data for scoring in sap change api")
|
logger.info("Preparing data for scoring in sap change api")
|
||||||
recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
|
recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
|
||||||
|
|
||||||
# Clean the data
|
|
||||||
logger.info("Reading in cleaning dataset from s3")
|
|
||||||
cleaning_data = read_parquet_from_s3(
|
|
||||||
bucket_name=get_settings().DATA_BUCKET,
|
|
||||||
file_key="sap_change_model/cleaning_dataset.parquet",
|
|
||||||
).rename(columns={"local-authority": "LOCAL_AUTHORITY"})
|
|
||||||
|
|
||||||
# Merge the cleaning data onto recommendations_scoring_data
|
|
||||||
|
|
||||||
# Perform the same cleaning as in the model
|
# Perform the same cleaning as in the model
|
||||||
recommendations_scoring_data = DataProcessor.apply_averages_cleaning(
|
recommendations_scoring_data = DataProcessor.apply_averages_cleaning(
|
||||||
data_to_clean=recommendations_scoring_data,
|
data_to_clean=recommendations_scoring_data,
|
||||||
|
|
@ -386,6 +190,13 @@ async def trigger_plan(body: PlanTriggerRequest):
|
||||||
cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
|
cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
|
||||||
).drop(columns=["LOCAL_AUTHORITY"])
|
).drop(columns=["LOCAL_AUTHORITY"])
|
||||||
|
|
||||||
|
recommendations_scoring_data = DataProcessor.clean_missings_after_description_process(
|
||||||
|
recommendations_scoring_data, [
|
||||||
|
c for c in recommendations_scoring_data.columns if
|
||||||
|
("thermal_transmittance" in c) or ("insulation_thickness" in c)
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at)
|
sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at)
|
||||||
file_location = sap_change_model_api.upload_scoring_data(
|
file_location = sap_change_model_api.upload_scoring_data(
|
||||||
df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET
|
df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET
|
||||||
|
|
@ -396,14 +207,17 @@ async def trigger_plan(body: PlanTriggerRequest):
|
||||||
|
|
||||||
# Retrieve the predictions
|
# Retrieve the predictions
|
||||||
predictions = pd.DataFrame(
|
predictions = pd.DataFrame(
|
||||||
read_csv_from_s3(bucket_name=get_settings().PREDICTIONS_BUCKET, filepath=response["storage_filepath"])
|
read_parquet_from_s3(
|
||||||
|
bucket_name=get_settings().PREDICTIONS_BUCKET,
|
||||||
|
file_key=response["storage_filepath"].split(get_settings().PREDICTIONS_BUCKET + "/")[1]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(1)
|
predictions["predictions"] = predictions["predictions"].astype(float).round(1)
|
||||||
predictions[['property_id', 'recommendation_id']] = predictions['id'].str.split('+', expand=True)
|
predictions[['property_id', 'recommendation_id']] = predictions['id'].str.split('+', expand=True)
|
||||||
|
|
||||||
# Insert the predictions into the recommendations and run the optimiser
|
# Insert the predictions into the recommendations and run the optimiser
|
||||||
logger.info("Storing recommendations")
|
logger.info("Optimising recommendations")
|
||||||
for property_id in recommendations.keys():
|
for property_id in recommendations.keys():
|
||||||
|
|
||||||
property = [p for p in input_properties if p.id == property_id][0]
|
property = [p for p in input_properties if p.id == property_id][0]
|
||||||
|
|
@ -411,9 +225,11 @@ async def trigger_plan(body: PlanTriggerRequest):
|
||||||
|
|
||||||
for recommendations_by_type in recommendations[property_id]:
|
for recommendations_by_type in recommendations[property_id]:
|
||||||
for rec in recommendations_by_type:
|
for rec in recommendations_by_type:
|
||||||
rec["sap_points"] = property_predictions[property_predictions["recommendation_id"] == str(
|
new_sap = property_predictions[property_predictions["recommendation_id"] == str(
|
||||||
rec["recommendation_id"]
|
rec["recommendation_id"]
|
||||||
)]["RDSAP_CHANGE"].values[0]
|
)]["predictions"].values[0]
|
||||||
|
|
||||||
|
rec["sap_points"] = new_sap - float(property.data["current-energy-efficiency"])
|
||||||
|
|
||||||
if rec["sap_points"] is None:
|
if rec["sap_points"] is None:
|
||||||
raise ValueError("Sap points missing")
|
raise ValueError("Sap points missing")
|
||||||
|
|
@ -451,8 +267,6 @@ async def trigger_plan(body: PlanTriggerRequest):
|
||||||
final_recommendations = [
|
final_recommendations = [
|
||||||
rec for recommendations_by_type in final_recommendations for rec in recommendations_by_type
|
rec for recommendations_by_type in final_recommendations for rec in recommendations_by_type
|
||||||
]
|
]
|
||||||
# We update recommendations[property_id]
|
|
||||||
|
|
||||||
recommendations[property_id] = final_recommendations
|
recommendations[property_id] = final_recommendations
|
||||||
|
|
||||||
# 1) the property data
|
# 1) the property data
|
||||||
|
|
|
||||||
176
backend/app/plan/temp_script_for_flight.py
Normal file
176
backend/app/plan/temp_script_for_flight.py
Normal file
|
|
@ -0,0 +1,176 @@
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from epc_api.client import EpcClient
|
||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from sqlalchemy.exc import IntegrityError, OperationalError
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
from starlette.responses import Response
|
||||||
|
|
||||||
|
from backend.app.config import get_settings
|
||||||
|
from backend.app.db.connection import db_engine
|
||||||
|
from backend.app.db.functions.materials_functions import get_materials
|
||||||
|
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
|
||||||
|
from backend.app.db.functions.property_functions import (
|
||||||
|
create_property, create_property_details_epc, create_property_targets, update_property_data
|
||||||
|
)
|
||||||
|
from backend.app.db.functions.recommendations_functions import (
|
||||||
|
create_plan, create_plan_recommendations, upload_recommendations
|
||||||
|
)
|
||||||
|
from backend.app.db.models.portfolio import rating_lookup
|
||||||
|
from backend.app.dependencies import validate_token
|
||||||
|
from backend.app.plan.schemas import PlanTriggerRequest
|
||||||
|
from backend.app.plan.utils import (
|
||||||
|
create_recommendation_scoring_data, filter_materials, get_cleaned, insert_temp_recommendation_id
|
||||||
|
)
|
||||||
|
from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_parquet_from_s3
|
||||||
|
|
||||||
|
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
|
||||||
|
from backend.Property import Property
|
||||||
|
from etl.epc.DataProcessor import DataProcessor
|
||||||
|
from etl.epc.settings import COLUMNS_TO_MERGE_ON
|
||||||
|
from recommendations.FloorRecommendations import FloorRecommendations
|
||||||
|
from recommendations.optimiser.CostOptimiser import CostOptimiser
|
||||||
|
from recommendations.optimiser.GainOptimiser import GainOptimiser
|
||||||
|
from recommendations.optimiser.optimiser_functions import prepare_input_measures
|
||||||
|
from recommendations.WallRecommendations import WallRecommendations
|
||||||
|
from utils.logger import setup_logger
|
||||||
|
from utils.s3 import read_dataframe_from_s3_parquet
|
||||||
|
|
||||||
|
logger = setup_logger()
|
||||||
|
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
with open('local_data.pickle', 'rb') as f:
|
||||||
|
local_data = pickle.load(f)
|
||||||
|
|
||||||
|
with open("property_dimensions.pickle", "rb") as f:
|
||||||
|
property_dimensions = pickle.load(f)
|
||||||
|
|
||||||
|
with open("sap_change_dataset.pickle", "rb") as f:
|
||||||
|
sap_change_dataset = pickle.load(f)
|
||||||
|
|
||||||
|
created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
||||||
|
|
||||||
|
plan_input = local_data["plan_input"]
|
||||||
|
uprn_filenames = local_data["uprn_filenames"]
|
||||||
|
local_property_data = local_data["local_property_data"]
|
||||||
|
materials = local_data["materials"]
|
||||||
|
materials_by_type = filter_materials(materials)
|
||||||
|
cleaned = local_data["cleaned"]
|
||||||
|
cleaning_data = local_data["cleaning_data"]
|
||||||
|
|
||||||
|
# Need to find some proper materials
|
||||||
|
materials_by_type["walls"] += [
|
||||||
|
{'id': 4, 'type': 'cavity_wall_insulation', 'description': 'Example Material 1',
|
||||||
|
'depths': None,
|
||||||
|
'depth_unit': None, 'cost': 20,
|
||||||
|
'cost_unit': 'gbp_sq_meter', 'r_value_per_mm': 0.0278, 'r_value_unit': 'square_meter_kelvin_per_watt',
|
||||||
|
'thermal_conductivity': 0.036, 'thermal_conductivity_unit': 'watt_per_meter_kelvin',
|
||||||
|
'link': None, 'created_at': None, 'is_active': True},
|
||||||
|
{'id': 10, 'type': "cavity_wall_insulation", 'description': 'Example Material 2',
|
||||||
|
'depths': None, 'depth_unit': None, 'cost': 25, 'cost_unit': 'gbp_sq_meter',
|
||||||
|
'r_value_per_mm': 0.02631579, 'r_value_unit': 'square_meter_kelvin_per_watt', 'thermal_conductivity': 0.038,
|
||||||
|
'thermal_conductivity_unit': 'watt_per_meter_kelvin',
|
||||||
|
'link': None,
|
||||||
|
'created_at': None, 'is_active': True}
|
||||||
|
]
|
||||||
|
|
||||||
|
epc_client = EpcClient(auth_token="NO-TOKEN")
|
||||||
|
|
||||||
|
input_properties = []
|
||||||
|
for i, config in enumerate(plan_input):
|
||||||
|
property_id = local_property_data[i]["id"]
|
||||||
|
input_properties.append(
|
||||||
|
Property(
|
||||||
|
postcode=config['postcode'],
|
||||||
|
address1=config['address'],
|
||||||
|
epc_client=epc_client,
|
||||||
|
id=property_id
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info("Getting EPC, and spatial data")
|
||||||
|
for i, p in enumerate(input_properties):
|
||||||
|
p.data = local_property_data[i]["data"]
|
||||||
|
p.uprn = local_property_data[i]["uprn"]
|
||||||
|
p.id = local_property_data[i]["id"]
|
||||||
|
p.full_sap_epc = local_property_data[i]["full_sap_epc"]
|
||||||
|
p.old_data = local_property_data[i]["old_data"]
|
||||||
|
p.is_listed = False
|
||||||
|
p.in_conservation_area = False
|
||||||
|
p.is_heritage = False
|
||||||
|
|
||||||
|
p.set_year_built()
|
||||||
|
|
||||||
|
# TODO: TESTING
|
||||||
|
p.data['number-habitable-rooms'] = 3
|
||||||
|
|
||||||
|
recommendations = {}
|
||||||
|
recommendations_scoring_data = []
|
||||||
|
|
||||||
|
for p in input_properties:
|
||||||
|
property_recommendations = []
|
||||||
|
|
||||||
|
# Property recommendations
|
||||||
|
p.get_components(cleaned)
|
||||||
|
|
||||||
|
# Floor recommendations
|
||||||
|
floor_recommender = FloorRecommendations(
|
||||||
|
property_instance=p,
|
||||||
|
materials=materials_by_type["floor"],
|
||||||
|
)
|
||||||
|
floor_recommender.recommend()
|
||||||
|
|
||||||
|
if floor_recommender.recommendations:
|
||||||
|
property_recommendations.append(floor_recommender.recommendations)
|
||||||
|
|
||||||
|
# Wall recommendations
|
||||||
|
|
||||||
|
wall_recomender = WallRecommendations(
|
||||||
|
property_instance=p,
|
||||||
|
materials=materials_by_type["walls"]
|
||||||
|
)
|
||||||
|
wall_recomender.recommend()
|
||||||
|
|
||||||
|
if wall_recomender.recommendations:
|
||||||
|
property_recommendations.append(wall_recomender.recommendations)
|
||||||
|
|
||||||
|
# We insert temporary ids into the recommendations which is important for the optimiser later
|
||||||
|
property_recommendations = insert_temp_recommendation_id(property_recommendations)
|
||||||
|
|
||||||
|
if not property_recommendations:
|
||||||
|
continue
|
||||||
|
|
||||||
|
recommendations[p.id] = property_recommendations
|
||||||
|
|
||||||
|
# Finally, we'll prepare data for predicting the impact on SAP
|
||||||
|
# TODO: We should use the cleaned data from get_components in the data rather than the raw
|
||||||
|
# values. We should create a method in Property which takes the EPC data and inserts the cleaned
|
||||||
|
# data
|
||||||
|
|
||||||
|
data_processor = DataProcessor(None, newdata=True)
|
||||||
|
data_processor.insert_data(pd.DataFrame([p.data.copy()]))
|
||||||
|
data_processor.pre_process()
|
||||||
|
|
||||||
|
starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
|
||||||
|
ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
|
||||||
|
fixed_data = data_processor.get_fixed_features()
|
||||||
|
|
||||||
|
# We update the ending record with the recommended updates and we set lodgement date to today
|
||||||
|
ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
|
||||||
|
|
||||||
|
for recommendations_by_type in property_recommendations:
|
||||||
|
for rec in recommendations_by_type:
|
||||||
|
scoring_dict = create_recommendation_scoring_data(
|
||||||
|
property=p,
|
||||||
|
recommendation=rec,
|
||||||
|
starting_epc_data=starting_epc_data,
|
||||||
|
ending_epc_data=ending_epc_data,
|
||||||
|
fixed_data=fixed_data,
|
||||||
|
)
|
||||||
|
|
||||||
|
recommendations_scoring_data.append(scoring_dict)
|
||||||
|
|
||||||
|
# cleanup
|
||||||
|
del data_processor
|
||||||
187
backend/app/plan/utils.py
Normal file
187
backend/app/plan/utils.py
Normal file
|
|
@ -0,0 +1,187 @@
|
||||||
|
import pandas as pd
|
||||||
|
from backend.Property import Property
|
||||||
|
from collections import defaultdict
|
||||||
|
from utils.s3 import read_from_s3
|
||||||
|
|
||||||
|
from recommendations.config import UPGRADES_MAP
|
||||||
|
from recommendations.recommendation_utils import get_wall_u_value, get_floor_u_value, get_roof_u_value
|
||||||
|
|
||||||
|
from backend.app.db.utils import row2dict
|
||||||
|
from backend.app.config import get_settings
|
||||||
|
import msgpack
|
||||||
|
|
||||||
|
|
||||||
|
def filter_materials(materials):
    """
    Group material rows by the building component they insulate.

    Every row is first converted with ``row2dict``. A converted row is then listed under
    "walls" or "floor" when its "type" value is one of the insulation types accepted for
    that component; rows whose type matches neither component are left out.
    :param materials: iterable of material rows accepted by ``row2dict``
    :return: dict with the keys "walls" and "floor", each mapping to the list of matching
    material dicts (in input order, possibly empty)
    """
    component_types = {
        "walls": ["internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation"],
        "floor": ["suspended_floor_insulation", "solid_floor_insulation"],
    }

    rows = [row2dict(row) for row in materials]

    return {
        component: [row for row in rows if row["type"] in accepted]
        for component, accepted in component_types.items()
    }
|
||||||
|
|
||||||
|
|
||||||
|
def insert_temp_recommendation_id(property_recommendations):
    """
    Creates a temporary recommendation id which is needed for filtering recommendations
    after the optimiser has been run.

    Ids are consecutive integers starting at 0, assigned in iteration order across all of
    the groups, so each id is unique within a single call. The recommendation dicts are
    updated in place.
    :param property_recommendations: nested list of recommendations, grouped by data_types
    :return: the same property_recommendations object, where each recommendation has a
    "recommendation_id" integer inserted
    """
    # Local import so the block does not rely on a module-level import being present
    from itertools import chain

    # Flatten the groups lazily and let enumerate supply the running id
    for idx, rec in enumerate(chain.from_iterable(property_recommendations)):
        rec["recommendation_id"] = idx

    return property_recommendations
|
||||||
|
|
||||||
|
|
||||||
|
def get_cleaned():
    """
    Retrieve the cleaned dataset from s3, which holds the cleaned descriptions for the
    epc dataset.

    The object is read from the "retrofit-data-<ENVIRONMENT>" bucket, where ENVIRONMENT comes
    from the application settings. Although the key ends in ".bson", the payload is decoded
    as MessagePack.
    :return: the decoded MessagePack payload
    """
    bucket_name = "retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)
    packed = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name=bucket_name,
    )

    return msgpack.unpackb(packed, raw=False)
|
||||||
|
|
||||||
|
|
||||||
|
def _wall_u_value(prop):
    """Look up the wall u-value from the property's cleaned wall record and age band."""
    return get_wall_u_value(
        clean_description=prop.walls["clean_description"],
        age_band=prop.age_band,
        is_granite_or_whinstone=prop.walls["is_granite_or_whinstone"],
        is_sandstone_or_limestone=prop.walls["is_sandstone_or_limestone"],
    )


def _floor_u_value(prop):
    """Look up the floor u-value from the property's floor geometry, wall type and age band."""
    return get_floor_u_value(
        floor_type=prop.floor_type,
        area=prop.floor_area,
        perimeter=prop.perimeter,
        wall_type=prop.wall_type,
        insulation_thickness=prop.floor["insulation_thickness"],
        age_band=prop.age_band,
    )


def _roof_u_value(prop):
    """Look up the roof u-value from the property's cleaned roof record and age band."""
    return get_roof_u_value(
        insulation_thickness=prop.roof["insulation_thickness"],
        has_dwelling_above=prop.roof["has_dwelling_above"],
        is_loft=prop.roof["is_loft"],
        is_roof_room=prop.roof["is_roof_room"],
        is_thatched=prop.roof["is_thatched"],
        age_band=prop.age_band,
        is_flat=prop.roof["is_flat"],
        is_pitched=prop.roof["is_pitched"],
        is_at_rafters=prop.roof["is_at_rafters"],
    )


def _fill_missing_ending(scoring_dict, component, lookup, prop):
    """Default the *_ENDING u-value and insulation thickness of a component that is not upgraded."""
    u_value_key = f"{component}_thermal_transmittance_ENDING"
    if not scoring_dict[u_value_key]:
        scoring_dict[u_value_key] = lookup(prop)

    thickness_key = f"{component}_insulation_thickness_ENDING"
    if scoring_dict[thickness_key] is None:
        scoring_dict[thickness_key] = "none"


def create_recommendation_scoring_data(
    property: Property,
    recommendation: dict,
    starting_epc_data: pd.DataFrame,
    ending_epc_data: pd.DataFrame,
    fixed_data: pd.DataFrame,
):
    """
    This wrapper function prepares data to be passed to the sap model api

    The first record of each of the three dataframes is merged into a single flat dict
    together with the property identifiers. Missing u-values are then filled from the
    property's cleaned components, missing insulation thicknesses are set to "none", and the
    "_ENDING" values of the component targeted by the recommendation are overwritten with
    the recommended upgrade.
    :param property: property the recommendation belongs to
    :param recommendation: recommendation dict; "type", "recommendation_id" and "new_u_value"
    are read, plus "parts" for floor insulation
    :param starting_epc_data: single row dataframe of starting component features
    :param ending_epc_data: single row dataframe of ending ("_ENDING") component features
    :param fixed_data: single row dataframe of features that do not change
    :return: dict of scoring features for this property/recommendation pair
    :raises NotImplementedError: for recommendation types other than wall or floor insulation,
    or for a floor insulation recommendation with more than one part
    """

    # Fail fast: only wall and floor insulation are supported, so don't do any of the lookups
    # below for a recommendation that is going to be rejected anyway
    if recommendation["type"] not in ["wall_insulation", "floor_insulation"]:
        raise NotImplementedError("Implement me")

    scoring_dict = {
        "UPRN": property.data["uprn"],
        "id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
        "LOCAL_AUTHORITY": property.data["local-authority"],
        **starting_epc_data.to_dict("records")[0],
        **ending_epc_data.to_dict("records")[0],
        **fixed_data.to_dict("records")[0],
    }

    u_value_lookups = {
        "walls": _wall_u_value,
        "floor": _floor_u_value,
        "roof": _roof_u_value,
    }

    # Set starting u-values if we don't have them
    # NOTE(review): `not value` treats None/0/"" as missing but not NaN - confirm the
    # dataframes never carry NaN for these columns
    for component, lookup in u_value_lookups.items():
        u_value_key = f"{component}_thermal_transmittance"
        if not scoring_dict[u_value_key]:
            scoring_dict[u_value_key] = lookup(property)

    for component in u_value_lookups:
        thickness_key = f"{component}_insulation_thickness"
        if scoring_dict[thickness_key] is None:
            scoring_dict[thickness_key] = "none"

    # We update the description to indicate it's insulated
    if recommendation["type"] == "wall_insulation":
        # The upgrade made here is to the u-value of the walls and the description of the
        # insulation thickness
        scoring_dict["walls_thermal_transmittance_ENDING"] = recommendation["new_u_value"]
        scoring_dict["walls_insulation_thickness_ENDING"] = "above average"
    else:
        _fill_missing_ending(scoring_dict, "walls", _wall_u_value, property)

    # Update description to indicate it's insulated
    if recommendation["type"] == "floor_insulation":

        if len(recommendation["parts"]) > 1:
            raise NotImplementedError("Have more than 1 floor insulation part - handle this case")

        scoring_dict["floor_thermal_transmittance_ENDING"] = recommendation["new_u_value"]
        # We don't really see above average for this in the training data
        scoring_dict["floor_insulation_thickness_ENDING"] = "average"
    else:
        _fill_missing_ending(scoring_dict, "floor", _floor_u_value, property)

    # No roof upgrades are supported yet, so the roof always keeps its existing values
    _fill_missing_ending(scoring_dict, "roof", _roof_u_value, property)

    return scoring_dict
|
||||||
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
|
@ -79,17 +79,17 @@ def sap_to_epc(sap_points: int):
|
||||||
if sap_points <= 0 or sap_points > 100:
|
if sap_points <= 0 or sap_points > 100:
|
||||||
raise ValueError("SAP points should be between 1 and 100.")
|
raise ValueError("SAP points should be between 1 and 100.")
|
||||||
|
|
||||||
if sap_points > 91:
|
if sap_points >= 92:
|
||||||
return "A"
|
return "A"
|
||||||
elif sap_points > 80:
|
elif sap_points >= 81:
|
||||||
return "B"
|
return "B"
|
||||||
elif sap_points > 69:
|
elif sap_points >= 69:
|
||||||
return "C"
|
return "C"
|
||||||
elif sap_points > 55:
|
elif sap_points >= 55:
|
||||||
return "D"
|
return "D"
|
||||||
elif sap_points > 39:
|
elif sap_points >= 39:
|
||||||
return "E"
|
return "E"
|
||||||
elif sap_points > 21:
|
elif sap_points >= 21:
|
||||||
return "F"
|
return "F"
|
||||||
else:
|
else:
|
||||||
return "G"
|
return "G"
|
||||||
|
|
@ -108,13 +108,13 @@ def epc_to_sap_lower_bound(epc: str):
|
||||||
elif epc == "B":
|
elif epc == "B":
|
||||||
return 81
|
return 81
|
||||||
elif epc == "C":
|
elif epc == "C":
|
||||||
return 70
|
return 69
|
||||||
elif epc == "D":
|
elif epc == "D":
|
||||||
return 56
|
return 55
|
||||||
elif epc == "E":
|
elif epc == "E":
|
||||||
return 40
|
return 39
|
||||||
elif epc == "F":
|
elif epc == "F":
|
||||||
return 22
|
return 21
|
||||||
elif epc == "G":
|
elif epc == "G":
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@ -62,14 +62,14 @@ class SAPChangeModelAPI:
|
||||||
logger.info("Making request to sap change api")
|
logger.info("Making request to sap change api")
|
||||||
url = f"{self.base_url}/sapmodel/predict"
|
url = f"{self.base_url}/sapmodel/predict"
|
||||||
payload = {
|
payload = {
|
||||||
"file_location": f"s3://retrofit-data-dev/{file_location}",
|
"file_location": file_location,
|
||||||
"property_id": "", # This should get removed
|
"property_id": "", # This should get removed
|
||||||
"portfolio_id": self.portfolio_id,
|
"portfolio_id": self.portfolio_id,
|
||||||
"created_at": self.timestamp
|
"created_at": self.timestamp
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = requests.post(url, json=payload, headers={"Content-Type": "application/json"})
|
response = requests.post(url, json=payload, headers={"Content-Type": "application/json"}, timeout=120)
|
||||||
|
|
||||||
# Check if the response status code is 2xx (success)
|
# Check if the response status code is 2xx (success)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
|
||||||
|
|
@ -34,4 +34,5 @@ pytz==2023.3
|
||||||
mip==1.15.0
|
mip==1.15.0
|
||||||
boto3==1.28.3
|
boto3==1.28.3
|
||||||
pandas==1.5.3
|
pandas==1.5.3
|
||||||
pyarrow==12.0.1
|
pyarrow==12.0.1
|
||||||
|
textblob
|
||||||
|
|
@ -1,15 +1,17 @@
|
||||||
import pytest
|
import pytest
|
||||||
import pandas as pd
|
|
||||||
from unittest.mock import Mock
|
from unittest.mock import Mock
|
||||||
from epc_api.client import EpcClient
|
from epc_api.client import EpcClient
|
||||||
from backend.Property import Property
|
from backend.Property import Property
|
||||||
from open_uprn.OpenUprnClient import OpenUprnClient
|
from etl.epc_clean.EpcClean import EpcClean
|
||||||
from model_data.EpcClean import EpcClean
|
|
||||||
|
|
||||||
# Define some test data
|
# Define some test data
|
||||||
mock_epc_response = {
|
mock_epc_response = {
|
||||||
"rows": [
|
"rows": [
|
||||||
{
|
{
|
||||||
|
"lmk-key": 1,
|
||||||
|
"uprn": 1,
|
||||||
|
"number-habitable-rooms": 5,
|
||||||
|
"property-type": "House",
|
||||||
"inspection-date": "2023-06-01",
|
"inspection-date": "2023-06-01",
|
||||||
"some-other-key": "some-value",
|
"some-other-key": "some-value",
|
||||||
"roof-description": "Roof Description",
|
"roof-description": "Roof Description",
|
||||||
|
|
@ -34,6 +36,10 @@ mock_epc_response = {
|
||||||
"construction-age-band": "England and Wales: 1967-1975"
|
"construction-age-band": "England and Wales: 1967-1975"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"lmk-key": 2,
|
||||||
|
"uprn": 2,
|
||||||
|
"number-habitable-rooms": 5,
|
||||||
|
"property-type": "House",
|
||||||
"inspection-date": "2023-05-01",
|
"inspection-date": "2023-05-01",
|
||||||
"some-other-key": "some-other-value",
|
"some-other-key": "some-other-value",
|
||||||
"roof-description": "Roof Description",
|
"roof-description": "Roof Description",
|
||||||
|
|
@ -63,6 +69,10 @@ mock_epc_response = {
|
||||||
mock_epc_response_dupe = {
|
mock_epc_response_dupe = {
|
||||||
'rows': [
|
'rows': [
|
||||||
{
|
{
|
||||||
|
"lmk-key": 1,
|
||||||
|
"uprn": 1,
|
||||||
|
"number-habitable-rooms": 5,
|
||||||
|
"property-type": "House",
|
||||||
'inspection-date': '2023-06-01', 'some-other-key': 'some-value', 'roof-description': 'Roof Description',
|
'inspection-date': '2023-06-01', 'some-other-key': 'some-value', 'roof-description': 'Roof Description',
|
||||||
'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
|
'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
|
||||||
'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
|
'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
|
||||||
|
|
@ -83,6 +93,10 @@ mock_epc_response_dupe = {
|
||||||
"construction-age-band": "England and Wales: 1967-1975"
|
"construction-age-band": "England and Wales: 1967-1975"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"lmk-key": 2,
|
||||||
|
"uprn": 2,
|
||||||
|
"number-habitable-rooms": 5,
|
||||||
|
"property-type": "House",
|
||||||
'inspection-date': '2023-05-01', 'some-other-key': 'some-other-value',
|
'inspection-date': '2023-05-01', 'some-other-key': 'some-other-value',
|
||||||
'roof-description': 'Roof Description', 'walls-description': 'Walls Description',
|
'roof-description': 'Roof Description', 'walls-description': 'Walls Description',
|
||||||
'windows-description': 'Windows Description', 'mainheat-description': 'Main Heating Description',
|
'windows-description': 'Windows Description', 'mainheat-description': 'Main Heating Description',
|
||||||
|
|
@ -104,6 +118,10 @@ mock_epc_response_dupe = {
|
||||||
"construction-age-band": "England and Wales: 1967-1975"
|
"construction-age-band": "England and Wales: 1967-1975"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"lmk-key": 3,
|
||||||
|
"uprn": 3,
|
||||||
|
"number-habitable-rooms": 5,
|
||||||
|
"property-type": "House",
|
||||||
'inspection-date': '2023-06-01', 'some-other-key': 'duplicate-date',
|
'inspection-date': '2023-06-01', 'some-other-key': 'duplicate-date',
|
||||||
'roof-description': 'Roof Description',
|
'roof-description': 'Roof Description',
|
||||||
'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
|
'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
|
||||||
|
|
@ -130,7 +148,7 @@ mock_epc_response_dupe = {
|
||||||
|
|
||||||
class TestProperty:
|
class TestProperty:
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def property_instance(self, mock_epc_client, mock_open_uprn_client, mock_cleaner):
|
def property_instance(self, mock_epc_client, mock_cleaner):
|
||||||
property_instance = Property(1, "AB12CD", "Test Address", epc_client=mock_epc_client)
|
property_instance = Property(1, "AB12CD", "Test Address", epc_client=mock_epc_client)
|
||||||
return property_instance
|
return property_instance
|
||||||
|
|
||||||
|
|
@ -141,29 +159,18 @@ class TestProperty:
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_epc_client(self):
|
def mock_epc_client(self):
|
||||||
mock_epc_client = Mock(spec=EpcClient())
|
mock_epc_client = Mock(spec=EpcClient(auth_token="mocked_auth_token"))
|
||||||
mock_epc_client.domestic.search.return_value = mock_epc_response.copy()
|
mock_epc_client.domestic.search.return_value = mock_epc_response.copy()
|
||||||
mock_epc_client.auth_token = "mocked_auth_token"
|
mock_epc_client.auth_token = "mocked_auth_token"
|
||||||
return mock_epc_client
|
return mock_epc_client
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_epc_client_dupe_data(self):
|
def mock_epc_client_dupe_data(self):
|
||||||
mock_epc_client_dupe_data = Mock(spec=EpcClient())
|
mock_epc_client_dupe_data = Mock(spec=EpcClient(auth_token="mocked_auth_token"))
|
||||||
mock_epc_client_dupe_data.domestic.search.return_value = mock_epc_response_dupe.copy()
|
mock_epc_client_dupe_data.domestic.search.return_value = mock_epc_response_dupe.copy()
|
||||||
mock_epc_client_dupe_data.auth_token = "mocked_auth_token"
|
mock_epc_client_dupe_data.auth_token = "mocked_auth_token"
|
||||||
return mock_epc_client_dupe_data
|
return mock_epc_client_dupe_data
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def mock_open_uprn_client(self):
|
|
||||||
mock_open_uprn_client = Mock(spec=OpenUprnClient(path=None, uprns=[12345]))
|
|
||||||
mock_open_uprn_client.data = pd.DataFrame(
|
|
||||||
[
|
|
||||||
{"UPRN": 12345, "longitude": 1.2345, "latitude": 2.3456},
|
|
||||||
{"UPRN": 12346, "longitude": 3.4567, "latitude": 4.5678}
|
|
||||||
]
|
|
||||||
)
|
|
||||||
return mock_open_uprn_client
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_cleaner(self):
|
def mock_cleaner(self):
|
||||||
lighting_averages = [
|
lighting_averages = [
|
||||||
|
|
@ -186,9 +193,22 @@ class TestProperty:
|
||||||
)
|
)
|
||||||
|
|
||||||
mock_cleaner = Mock(spec=cleaner_spec)
|
mock_cleaner = Mock(spec=cleaner_spec)
|
||||||
|
|
||||||
|
walls_data = {
|
||||||
|
"original_description": "Walls Description",
|
||||||
|
"is_cavity_wall": True,
|
||||||
|
"is_solid_brick": False,
|
||||||
|
"is_timber_frame": False,
|
||||||
|
"is_system_built": False,
|
||||||
|
"is_park_home": False,
|
||||||
|
"is_cob": False,
|
||||||
|
"is_sandstone_or_limestone": False,
|
||||||
|
"is_granite_or_whinstone": False,
|
||||||
|
}
|
||||||
|
|
||||||
mock_cleaner.cleaned = {
|
mock_cleaner.cleaned = {
|
||||||
"roof-description": [{"original_description": "Roof Description"}],
|
"roof-description": [{"original_description": "Roof Description"}],
|
||||||
"walls-description": [{"original_description": "Walls Description"}],
|
"walls-description": [walls_data],
|
||||||
"windows-description": [{"original_description": "Windows Description"}],
|
"windows-description": [{"original_description": "Windows Description"}],
|
||||||
"mainheat-description": [{"original_description": "Main Heating Description"}],
|
"mainheat-description": [{"original_description": "Main Heating Description"}],
|
||||||
"hotwater-description": [{"original_description": "Hot Water Description"}],
|
"hotwater-description": [{"original_description": "Hot Water Description"}],
|
||||||
|
|
@ -201,10 +221,10 @@ class TestProperty:
|
||||||
# Should be mocked auth token
|
# Should be mocked auth token
|
||||||
assert inst1.epc_client.auth_token == "mocked_auth_token"
|
assert inst1.epc_client.auth_token == "mocked_auth_token"
|
||||||
|
|
||||||
inst2 = Property(3, "AB12CD", "Test Address")
|
inst2 = Property(3, "AB12CD", "Test Address", epc_client=mock_epc_client)
|
||||||
assert inst2.epc_client.auth_token
|
assert inst2.epc_client.auth_token
|
||||||
|
|
||||||
inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"})
|
inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"}, epc_client=mock_epc_client)
|
||||||
assert inst3.data == {"some": "data"}
|
assert inst3.data == {"some": "data"}
|
||||||
|
|
||||||
data = inst3.search_address_epc()
|
data = inst3.search_address_epc()
|
||||||
|
|
@ -227,11 +247,23 @@ class TestProperty:
|
||||||
|
|
||||||
# Verify that the components are set correctly
|
# Verify that the components are set correctly
|
||||||
assert property_instance.roof == {"original_description": "Roof Description"}
|
assert property_instance.roof == {"original_description": "Roof Description"}
|
||||||
assert property_instance.walls == {"original_description": "Walls Description"}
|
assert property_instance.walls == {
|
||||||
|
"original_description": "Walls Description",
|
||||||
|
"is_cavity_wall": True,
|
||||||
|
"is_solid_brick": False,
|
||||||
|
"is_timber_frame": False,
|
||||||
|
"is_system_built": False,
|
||||||
|
"is_park_home": False,
|
||||||
|
"is_cob": False,
|
||||||
|
"is_sandstone_or_limestone": False,
|
||||||
|
"is_granite_or_whinstone": False,
|
||||||
|
}
|
||||||
assert property_instance.windows == {"original_description": "Windows Description"}
|
assert property_instance.windows == {"original_description": "Windows Description"}
|
||||||
assert property_instance.main_heating == {"original_description": "Main Heating Description"}
|
assert property_instance.main_heating == {"original_description": "Main Heating Description"}
|
||||||
assert property_instance.hotwater == {"original_description": "Hot Water Description"}
|
assert property_instance.hotwater == {"original_description": "Hot Water Description"}
|
||||||
|
|
||||||
|
assert property_instance.wall_type == "cavity"
|
||||||
|
|
||||||
def test_get_components_without_cleaned_data(self, property_instance, mock_cleaner):
|
def test_get_components_without_cleaned_data(self, property_instance, mock_cleaner):
|
||||||
# Modify the mock EpcClean to not have cleaned data
|
# Modify the mock EpcClean to not have cleaned data
|
||||||
mock_cleaner.cleaned = {}
|
mock_cleaner.cleaned = {}
|
||||||
|
|
|
||||||
|
|
@ -1,51 +0,0 @@
|
||||||
"""
|
|
||||||
This application reads in the open uprn data from a static location and loads it into
|
|
||||||
our database for querying from other services
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
from conservation_areas.ConservationAreaClient import ConservationAreaClient
|
|
||||||
from datatypes.datatypes import OpenUprnCoordinateData
|
|
||||||
|
|
||||||
|
|
||||||
def app():
    """
    Check a fixed sample of open uprn coordinates against the conservation area data.

    A ConservationAreaClient is built from the Historic England shapefile and the gov geojson
    (both paths are built from this module's directory), its data is read, and
    ``is_in_conservation_area`` is evaluated for every hard-coded open uprn record. The
    outcome is only collected in a local list; nothing is returned or persisted.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))

    conservation_area_client = ConservationAreaClient(
        historic_england_path=(
            module_dir + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp"
        ),
        gov_path=module_dir + "/model_data/local_data/gov-conservation-area.geojson",
    )
    conservation_area_client.read()

    # Sample of open uprn records to check against the conservation areas
    open_uprn_data = [
        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
         'LONGITUDE': -0.0540506},
        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
         'LONGITUDE': -0.0498772},
        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
         'LONGITUDE': -0.226392},
        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
         'LONGITUDE': -0.0792445},
        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
         'LONGITUDE': -0.0792445},
        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
         'LONGITUDE': -0.0468833},
        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
         'LONGITUDE': -0.1362513},
        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
         'LONGITUDE': -0.0823165},
    ]

    # One entry per uprn, flagging whether its coordinates fall in a conservation area
    result = []
    for coordinates in open_uprn_data:
        in_conservation_area = conservation_area_client.is_in_conservation_area(
            OpenUprnCoordinateData(**coordinates)
        )
        result.append({
            "uprn": coordinates["UPRN"],
            "is_in_conservation_area": in_conservation_area,
        })
|
|
||||||
|
|
||||||
# TODO: Add a method to write to the database
|
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
# Data Collection
|
|
||||||
|
|
||||||
This service is specifically focused on the collection of data from external sources which aren't easily
|
|
||||||
accessed via api or via downloadable data sources. For example, wages data requires a specific application to
|
|
||||||
pull that data from websites, e.g. from Adzuna's api
|
|
||||||
|
|
@ -1,86 +0,0 @@
|
||||||
import requests
|
|
||||||
import json
|
|
||||||
from data_collection.config import ADZUNA_API_KEY, ADZUNA_APP_ID
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
"""
|
|
||||||
Table of constituencies and their codes can be downloaded from the Office of National Statistics, found here:
|
|
||||||
https://geoportal.statistics.gov.uk/datasets/ons::westminster-parliamentary-constituencies-december-2022-names-and
|
|
||||||
-codes-in-the-united-kingdom/explore
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Load the constituency names/codes table (path is built from this module's directory) and
# tag every row as a constituency-level location
constituencies = pd.read_csv(
    os.path.abspath(os.path.dirname(__file__))
    + "/data_collection/data/Westminster_Parliamentary_Constituencies_("
    "December_2022)_Names_and_Codes_in_the_United_Kingdom.csv"
).assign(location_type="constituency")
|
|
||||||
|
|
||||||
|
|
||||||
def retry_api_call(job_title, location, max_retries=10):
    """
    Call get_adzuna_jobs, retrying when the request fails.

    HTTP errors, connection errors and timeouts are retried after a 2 second pause. Any
    other exception is not caught and propagates to the caller.
    :param job_title: job title to search for
    :param location: location to search in
    :param max_retries: maximum number of attempts before giving up
    :return: the response of get_adzuna_jobs, or None if every attempt failed
    """
    for attempt in range(1, max_retries + 1):
        try:
            return get_adzuna_jobs(job_title, location)
        except (requests.HTTPError, requests.ConnectionError, requests.Timeout):
            if attempt < max_retries:
                print(f"Attempt {attempt} failed. Retrying in 2 seconds...")
                time.sleep(2)
            else:
                # Nothing left to retry, so don't wait (or claim that we will)
                print(f"Attempt {attempt} failed.")

    print(f"Failed after {max_retries} attempts.")
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def get_adzuna_jobs(job_title, location, timeout=30):
    """
    Search the Adzuna jobs api for a job title around a location.

    Requests the first page (up to 25 results) of the "gb" search endpoint, within a
    distance of 10 of the location.
    :param job_title: free text passed as the "what" search parameter
    :param location: free text passed as the "where" search parameter
    :param timeout: seconds to wait for the api before giving up; without a timeout requests
    waits indefinitely on a stalled connection
    :return: the decoded json response
    :raises requests.HTTPError: if the api responds with an error status
    :raises requests.Timeout: if the api does not respond within ``timeout`` seconds
    """
    base_url = "https://api.adzuna.com/v1/api/jobs"
    country_code = "gb"

    url = f"{base_url}/{country_code}/search/1"

    params = {
        "app_id": ADZUNA_APP_ID,
        "app_key": ADZUNA_API_KEY,
        "results_per_page": 25,
        "what": job_title,
        "where": location,
        "content-type": "application/json",
        "distance": 10,
    }

    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()

    return response.json()
|
|
||||||
|
|
||||||
|
|
||||||
JOB_TITLES = [
|
|
||||||
"insulation installer", "internal wall insulation installer", "external wall insulation installer",
|
|
||||||
"cavity wall insulation installer", "loft insulation installer", "roof insulation installer",
|
|
||||||
"spray foam insulation installer", "insulation technician", "insulation engineer", "iwi insulation installer",
|
|
||||||
"iwi installer", "ewi insulation installer", "ewi installer", "cwi insulation installer", "cwi installer",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Collect one row per advert returned for every (job title, constituency) search
results = []
for i, job_title in enumerate(JOB_TITLES):
    print(f"Pulling job title {i + 1} of {len(JOB_TITLES)}")
    for _, location_config in tqdm(constituencies.iterrows(), total=constituencies.shape[0]):

        location = location_config["PCON22NM"]
        jobs = retry_api_call(job_title, location)
        # Pause between searches so we don't hammer the api
        time.sleep(0.5)

        if jobs is None:
            # retry_api_call returns None once its retries are exhausted; skip this search
            # instead of crashing and losing everything collected so far
            continue

        for job in jobs["results"] or []:
            results.append({
                "job_title": job_title,
                "search_location": location,
                "search_location_code": location_config["PCON22CD"],
                **job,
            })

results_df = pd.DataFrame(results)
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
||||||
import os
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
|
|
||||||
load_dotenv(dotenv_path='data_collection/.env')
|
|
||||||
|
|
||||||
ADZUNA_API_KEY = os.environ.get('ADZUNA_API_KEY')
|
|
||||||
ADZUNA_APP_ID = os.environ.get('ADZUNA_APP_ID')
|
|
||||||
BIN
data_collection/data/.DS_Store
vendored
BIN
data_collection/data/.DS_Store
vendored
Binary file not shown.
|
|
@ -1,651 +0,0 @@
|
||||||
PCON22CD,PCON22NM,ObjectId
|
|
||||||
E14000530,Aldershot,1
|
|
||||||
E14000531,Aldridge-Brownhills,2
|
|
||||||
E14000532,Altrincham and Sale West,3
|
|
||||||
E14000533,Amber Valley,4
|
|
||||||
E14000534,Arundel and South Downs,5
|
|
||||||
E14000535,Ashfield,6
|
|
||||||
E14000536,Ashford,7
|
|
||||||
E14000537,Ashton-under-Lyne,8
|
|
||||||
E14000538,Aylesbury,9
|
|
||||||
E14000539,Banbury,10
|
|
||||||
E14000540,Barking,11
|
|
||||||
E14000541,Barnsley Central,12
|
|
||||||
E14000542,Barnsley East,13
|
|
||||||
E14000543,Barrow and Furness,14
|
|
||||||
E14000544,Basildon and Billericay,15
|
|
||||||
E14000545,Basingstoke,16
|
|
||||||
E14000546,Bassetlaw,17
|
|
||||||
E14000547,Bath,18
|
|
||||||
E14000548,Batley and Spen,19
|
|
||||||
E14000549,Battersea,20
|
|
||||||
E14000550,Beaconsfield,21
|
|
||||||
E14000551,Beckenham,22
|
|
||||||
E14000552,Bedford,23
|
|
||||||
E14000553,Bermondsey and Old Southwark,24
|
|
||||||
E14000554,Berwick-upon-Tweed,25
|
|
||||||
E14000555,Bethnal Green and Bow,26
|
|
||||||
E14000556,Beverley and Holderness,27
|
|
||||||
E14000557,Bexhill and Battle,28
|
|
||||||
E14000558,Bexleyheath and Crayford,29
|
|
||||||
E14000559,Birkenhead,30
|
|
||||||
E14000560,"Birmingham, Edgbaston",31
|
|
||||||
E14000561,"Birmingham, Erdington",32
|
|
||||||
E14000562,"Birmingham, Hall Green",33
|
|
||||||
E14000563,"Birmingham, Hodge Hill",34
|
|
||||||
E14000564,"Birmingham, Ladywood",35
|
|
||||||
E14000565,"Birmingham, Northfield",36
|
|
||||||
E14000566,"Birmingham, Perry Barr",37
|
|
||||||
E14000567,"Birmingham, Selly Oak",38
|
|
||||||
E14000568,"Birmingham, Yardley",39
|
|
||||||
E14000569,Bishop Auckland,40
|
|
||||||
E14000570,Blackburn,41
|
|
||||||
E14000571,Blackley and Broughton,42
|
|
||||||
E14000572,Blackpool North and Cleveleys,43
|
|
||||||
E14000573,Blackpool South,44
|
|
||||||
E14000574,Blaydon,45
|
|
||||||
E14000575,Blyth Valley,46
|
|
||||||
E14000576,Bognor Regis and Littlehampton,47
|
|
||||||
E14000577,Bolsover,48
|
|
||||||
E14000578,Bolton North East,49
|
|
||||||
E14000579,Bolton South East,50
|
|
||||||
E14000830,Newbury,51
|
|
||||||
E14000831,Newcastle upon Tyne Central,52
|
|
||||||
E14000832,Newcastle upon Tyne East,53
|
|
||||||
E14000833,Newcastle upon Tyne North,54
|
|
||||||
E14000834,Newcastle-under-Lyme,55
|
|
||||||
E14000835,Newton Abbot,56
|
|
||||||
E14000836,"Normanton, Pontefract and Castleford",57
|
|
||||||
E14000837,North Cornwall,58
|
|
||||||
E14000838,North Devon,59
|
|
||||||
E14000839,North Dorset,60
|
|
||||||
E14000840,North Durham,61
|
|
||||||
E14000841,North East Bedfordshire,62
|
|
||||||
E14000842,North East Cambridgeshire,63
|
|
||||||
E14000843,North East Derbyshire,64
|
|
||||||
E14000844,North East Hampshire,65
|
|
||||||
E14000845,North East Hertfordshire,66
|
|
||||||
E14000846,North East Somerset,67
|
|
||||||
E14000847,North Herefordshire,68
|
|
||||||
E14000848,North Norfolk,69
|
|
||||||
E14000849,North Shropshire,70
|
|
||||||
E14000850,North Somerset,71
|
|
||||||
E14000851,North Swindon,72
|
|
||||||
E14000852,North Thanet,73
|
|
||||||
E14000853,North Tyneside,74
|
|
||||||
E14000854,North Warwickshire,75
|
|
||||||
E14000855,North West Cambridgeshire,76
|
|
||||||
E14000856,North West Durham,77
|
|
||||||
E14000857,North West Hampshire,78
|
|
||||||
E14000858,North West Leicestershire,79
|
|
||||||
E14000859,North West Norfolk,80
|
|
||||||
E14000860,North Wiltshire,81
|
|
||||||
E14000861,Northampton North,82
|
|
||||||
E14000862,Northampton South,83
|
|
||||||
E14000863,Norwich North,84
|
|
||||||
E14000864,Norwich South,85
|
|
||||||
E14000865,Nottingham East,86
|
|
||||||
E14000866,Nottingham North,87
|
|
||||||
E14000867,Nottingham South,88
|
|
||||||
E14000868,Nuneaton,89
|
|
||||||
E14000869,Old Bexley and Sidcup,90
|
|
||||||
E14000870,Oldham East and Saddleworth,91
|
|
||||||
E14000871,Oldham West and Royton,92
|
|
||||||
E14000872,Orpington,93
|
|
||||||
E14000873,Oxford East,94
|
|
||||||
E14000874,Oxford West and Abingdon,95
|
|
||||||
E14000875,Pendle,96
|
|
||||||
E14000876,Penistone and Stocksbridge,97
|
|
||||||
E14000877,Penrith and The Border,98
|
|
||||||
E14000878,Peterborough,99
|
|
||||||
E14000879,"Plymouth, Moor View",100
|
|
||||||
E14000580,Bolton West,101
|
|
||||||
E14000581,Bootle,102
|
|
||||||
E14000582,Boston and Skegness,103
|
|
||||||
E14000583,Bosworth,104
|
|
||||||
E14000584,Bournemouth East,105
|
|
||||||
E14000585,Bournemouth West,106
|
|
||||||
E14000586,Bracknell,107
|
|
||||||
E14000587,Bradford East,108
|
|
||||||
E14000588,Bradford South,109
|
|
||||||
E14000589,Bradford West,110
|
|
||||||
E14000590,Braintree,111
|
|
||||||
E14000591,Brent Central,112
|
|
||||||
E14000592,Brent North,113
|
|
||||||
E14000593,Brentford and Isleworth,114
|
|
||||||
E14000594,Brentwood and Ongar,115
|
|
||||||
E14000595,Bridgwater and West Somerset,116
|
|
||||||
E14000596,Brigg and Goole,117
|
|
||||||
E14000597,"Brighton, Kemptown",118
|
|
||||||
E14000598,"Brighton, Pavilion",119
|
|
||||||
E14000599,Bristol East,120
|
|
||||||
E14000600,Bristol North West,121
|
|
||||||
E14000601,Bristol South,122
|
|
||||||
E14000602,Bristol West,123
|
|
||||||
E14000603,Broadland,124
|
|
||||||
E14000604,Bromley and Chislehurst,125
|
|
||||||
E14000605,Bromsgrove,126
|
|
||||||
E14000606,Broxbourne,127
|
|
||||||
E14000607,Broxtowe,128
|
|
||||||
E14000608,Buckingham,129
|
|
||||||
E14000609,Burnley,130
|
|
||||||
E14000610,Burton,131
|
|
||||||
E14000611,Bury North,132
|
|
||||||
E14000612,Bury South,133
|
|
||||||
E14000613,Bury St Edmunds,134
|
|
||||||
E14000614,Calder Valley,135
|
|
||||||
E14000615,Camberwell and Peckham,136
|
|
||||||
E14000616,Camborne and Redruth,137
|
|
||||||
E14000617,Cambridge,138
|
|
||||||
E14000618,Cannock Chase,139
|
|
||||||
E14000619,Canterbury,140
|
|
||||||
E14000620,Carlisle,141
|
|
||||||
E14000621,Carshalton and Wallington,142
|
|
||||||
E14000622,Castle Point,143
|
|
||||||
E14000623,Central Devon,144
|
|
||||||
E14000624,Central Suffolk and North Ipswich,145
|
|
||||||
E14000625,Charnwood,146
|
|
||||||
E14000626,Chatham and Aylesford,147
|
|
||||||
E14000627,Cheadle,148
|
|
||||||
E14000628,Chelmsford,149
|
|
||||||
E14000629,Chelsea and Fulham,150
|
|
||||||
E14000630,Cheltenham,151
|
|
||||||
E14000631,Chesham and Amersham,152
|
|
||||||
E14000632,Chesterfield,153
|
|
||||||
E14000633,Chichester,154
|
|
||||||
E14000634,Chingford and Woodford Green,155
|
|
||||||
E14000635,Chippenham,156
|
|
||||||
E14000636,Chipping Barnet,157
|
|
||||||
E14000637,Chorley,158
|
|
||||||
E14000638,Christchurch,159
|
|
||||||
E14000639,Cities of London and Westminster,160
|
|
||||||
E14000640,City of Chester,161
|
|
||||||
E14000641,City of Durham,162
|
|
||||||
E14000642,Clacton,163
|
|
||||||
E14000643,Cleethorpes,164
|
|
||||||
E14000644,Colchester,165
|
|
||||||
E14000645,Colne Valley,166
|
|
||||||
E14000646,Congleton,167
|
|
||||||
E14000647,Copeland,168
|
|
||||||
E14000648,Corby,169
|
|
||||||
E14000649,Coventry North East,170
|
|
||||||
E14000650,Coventry North West,171
|
|
||||||
E14000651,Coventry South,172
|
|
||||||
E14000652,Crawley,173
|
|
||||||
E14000653,Crewe and Nantwich,174
|
|
||||||
E14000654,Croydon Central,175
|
|
||||||
E14000655,Croydon North,176
|
|
||||||
E14000656,Croydon South,177
|
|
||||||
E14000657,Dagenham and Rainham,178
|
|
||||||
E14000658,Darlington,179
|
|
||||||
E14000659,Dartford,180
|
|
||||||
E14000660,Daventry,181
|
|
||||||
E14000661,Denton and Reddish,182
|
|
||||||
E14000662,Derby North,183
|
|
||||||
E14000663,Derby South,184
|
|
||||||
E14000664,Derbyshire Dales,185
|
|
||||||
E14000665,Devizes,186
|
|
||||||
E14000666,Dewsbury,187
|
|
||||||
E14000667,Don Valley,188
|
|
||||||
E14000668,Doncaster Central,189
|
|
||||||
E14000669,Doncaster North,190
|
|
||||||
E14000670,Dover,191
|
|
||||||
E14000671,Dudley North,192
|
|
||||||
E14000672,Dudley South,193
|
|
||||||
E14000673,Dulwich and West Norwood,194
|
|
||||||
E14000674,Ealing Central and Acton,195
|
|
||||||
E14000675,Ealing North,196
|
|
||||||
E14000676,"Ealing, Southall",197
|
|
||||||
E14000677,Easington,198
|
|
||||||
E14000678,East Devon,199
|
|
||||||
E14000679,East Ham,200
|
|
||||||
E14000780,Leeds North West,201
|
|
||||||
E14000781,Leeds West,202
|
|
||||||
E14000782,Leicester East,203
|
|
||||||
E14000783,Leicester South,204
|
|
||||||
E14000784,Leicester West,205
|
|
||||||
E14000785,Leigh,206
|
|
||||||
E14000786,Lewes,207
|
|
||||||
E14000787,Lewisham East,208
|
|
||||||
E14000788,Lewisham West and Penge,209
|
|
||||||
E14000789,"Lewisham, Deptford",210
|
|
||||||
E14000790,Leyton and Wanstead,211
|
|
||||||
E14000791,Lichfield,212
|
|
||||||
E14000792,Lincoln,213
|
|
||||||
E14000793,"Liverpool, Riverside",214
|
|
||||||
E14000794,"Liverpool, Walton",215
|
|
||||||
E14000795,"Liverpool, Wavertree",216
|
|
||||||
E14000796,"Liverpool, West Derby",217
|
|
||||||
E14000797,Loughborough,218
|
|
||||||
E14000798,Louth and Horncastle,219
|
|
||||||
E14000799,Ludlow,220
|
|
||||||
E14000800,Luton North,221
|
|
||||||
E14000801,Luton South,222
|
|
||||||
E14000802,Macclesfield,223
|
|
||||||
E14000803,Maidenhead,224
|
|
||||||
E14000804,Maidstone and The Weald,225
|
|
||||||
E14000805,Makerfield,226
|
|
||||||
E14000806,Maldon,227
|
|
||||||
E14000807,Manchester Central,228
|
|
||||||
E14000808,"Manchester, Gorton",229
|
|
||||||
E14000809,"Manchester, Withington",230
|
|
||||||
E14000810,Mansfield,231
|
|
||||||
E14000811,Meon Valley,232
|
|
||||||
E14000812,Meriden,233
|
|
||||||
E14000813,Mid Bedfordshire,234
|
|
||||||
E14000814,Mid Derbyshire,235
|
|
||||||
E14000815,Mid Dorset and North Poole,236
|
|
||||||
E14000816,Mid Norfolk,237
|
|
||||||
E14000817,Mid Sussex,238
|
|
||||||
E14000818,Mid Worcestershire,239
|
|
||||||
E14000819,Middlesbrough,240
|
|
||||||
E14000820,Middlesbrough South and East Cleveland,241
|
|
||||||
E14000821,Milton Keynes North,242
|
|
||||||
E14000822,Milton Keynes South,243
|
|
||||||
E14000823,Mitcham and Morden,244
|
|
||||||
E14000824,Mole Valley,245
|
|
||||||
E14000825,Morecambe and Lunesdale,246
|
|
||||||
E14000826,Morley and Outwood,247
|
|
||||||
E14000827,New Forest East,248
|
|
||||||
E14000828,New Forest West,249
|
|
||||||
E14000829,Newark,250
|
|
||||||
E14000680,East Hampshire,251
|
|
||||||
E14000681,East Surrey,252
|
|
||||||
E14000682,East Worthing and Shoreham,253
|
|
||||||
E14000683,East Yorkshire,254
|
|
||||||
E14000880,"Plymouth, Sutton and Devonport",255
|
|
||||||
E14000684,Eastbourne,256
|
|
||||||
E14000685,Eastleigh,257
|
|
||||||
E14000881,Poole,258
|
|
||||||
E14000686,Eddisbury,259
|
|
||||||
E14000882,Poplar and Limehouse,260
|
|
||||||
E14000687,Edmonton,261
|
|
||||||
E14000883,Portsmouth North,262
|
|
||||||
E14000688,Ellesmere Port and Neston,263
|
|
||||||
E14000884,Portsmouth South,264
|
|
||||||
E14000689,Elmet and Rothwell,265
|
|
||||||
E14000885,Preston,266
|
|
||||||
E14000690,Eltham,267
|
|
||||||
E14000886,Pudsey,268
|
|
||||||
E14000691,Enfield North,269
|
|
||||||
E14000887,Putney,270
|
|
||||||
E14000692,"Enfield, Southgate",271
|
|
||||||
E14000888,Rayleigh and Wickford,272
|
|
||||||
E14000693,Epping Forest,273
|
|
||||||
E14000889,Reading East,274
|
|
||||||
E14000694,Epsom and Ewell,275
|
|
||||||
E14000890,Reading West,276
|
|
||||||
E14000695,Erewash,277
|
|
||||||
E14000891,Redcar,278
|
|
||||||
E14000696,Erith and Thamesmead,279
|
|
||||||
E14000892,Redditch,280
|
|
||||||
E14000697,Esher and Walton,281
|
|
||||||
E14000893,Reigate,282
|
|
||||||
E14000698,Exeter,283
|
|
||||||
E14000894,Ribble Valley,284
|
|
||||||
E14000699,Fareham,285
|
|
||||||
E14000895,Richmond (Yorks),286
|
|
||||||
E14000700,Faversham and Mid Kent,287
|
|
||||||
E14000896,Richmond Park,288
|
|
||||||
E14000701,Feltham and Heston,289
|
|
||||||
E14000897,Rochdale,290
|
|
||||||
E14000702,Filton and Bradley Stoke,291
|
|
||||||
E14000898,Rochester and Strood,292
|
|
||||||
E14000703,Finchley and Golders Green,293
|
|
||||||
E14000899,Rochford and Southend East,294
|
|
||||||
E14000704,Folkestone and Hythe,295
|
|
||||||
E14000900,Romford,296
|
|
||||||
E14000705,Forest of Dean,297
|
|
||||||
E14000901,Romsey and Southampton North,298
|
|
||||||
E14000706,Fylde,299
|
|
||||||
E14000902,Rossendale and Darwen,300
|
|
||||||
E14000707,Gainsborough,301
|
|
||||||
E14000903,Rother Valley,302
|
|
||||||
E14000904,Rotherham,303
|
|
||||||
E14000905,Rugby,304
|
|
||||||
E14000906,"Ruislip, Northwood and Pinner",305
|
|
||||||
E14000907,Runnymede and Weybridge,306
|
|
||||||
E14000908,Rushcliffe,307
|
|
||||||
E14000909,Rutland and Melton,308
|
|
||||||
E14000910,Saffron Walden,309
|
|
||||||
E14000911,Salford and Eccles,310
|
|
||||||
E14000912,Salisbury,311
|
|
||||||
E14000913,Scarborough and Whitby,312
|
|
||||||
E14000914,Scunthorpe,313
|
|
||||||
E14000915,Sedgefield,314
|
|
||||||
E14000916,Sefton Central,315
|
|
||||||
E14000917,Selby and Ainsty,316
|
|
||||||
E14000918,Sevenoaks,317
|
|
||||||
E14000919,Sheffield Central,318
|
|
||||||
E14000920,Sheffield South East,319
|
|
||||||
E14000921,"Sheffield, Brightside and Hillsborough",320
|
|
||||||
E14000922,"Sheffield, Hallam",321
|
|
||||||
E14000923,"Sheffield, Heeley",322
|
|
||||||
E14000924,Sherwood,323
|
|
||||||
E14000925,Shipley,324
|
|
||||||
E14000926,Shrewsbury and Atcham,325
|
|
||||||
E14000927,Sittingbourne and Sheppey,326
|
|
||||||
E14000928,Skipton and Ripon,327
|
|
||||||
E14000929,Sleaford and North Hykeham,328
|
|
||||||
E14000730,Harrogate and Knaresborough,329
|
|
||||||
E14000731,Harrow East,330
|
|
||||||
E14000732,Harrow West,331
|
|
||||||
E14000733,Hartlepool,332
|
|
||||||
E14000734,Harwich and North Essex,333
|
|
||||||
E14000735,Hastings and Rye,334
|
|
||||||
E14000736,Havant,335
|
|
||||||
E14000737,Hayes and Harlington,336
|
|
||||||
E14000738,Hazel Grove,337
|
|
||||||
E14000739,Hemel Hempstead,338
|
|
||||||
E14000740,Hemsworth,339
|
|
||||||
E14000741,Hendon,340
|
|
||||||
E14000742,Henley,341
|
|
||||||
E14000743,Hereford and South Herefordshire,342
|
|
||||||
E14000744,Hertford and Stortford,343
|
|
||||||
E14000745,Hertsmere,344
|
|
||||||
E14000746,Hexham,345
|
|
||||||
E14000747,Heywood and Middleton,346
|
|
||||||
E14000748,High Peak,347
|
|
||||||
E14000749,Hitchin and Harpenden,348
|
|
||||||
E14000750,Holborn and St Pancras,349
|
|
||||||
E14000751,Hornchurch and Upminster,350
|
|
||||||
E14000752,Hornsey and Wood Green,351
|
|
||||||
E14000753,Horsham,352
|
|
||||||
E14000754,Houghton and Sunderland South,353
|
|
||||||
E14000755,Hove,354
|
|
||||||
E14000756,Huddersfield,355
|
|
||||||
E14000757,Huntingdon,356
|
|
||||||
E14000758,Hyndburn,357
|
|
||||||
E14000759,Ilford North,358
|
|
||||||
E14000760,Ilford South,359
|
|
||||||
E14000761,Ipswich,360
|
|
||||||
E14000762,Isle of Wight,361
|
|
||||||
E14000763,Islington North,362
|
|
||||||
E14000764,Islington South and Finsbury,363
|
|
||||||
E14000765,Jarrow,364
|
|
||||||
E14000766,Keighley,365
|
|
||||||
E14000767,Kenilworth and Southam,366
|
|
||||||
E14000768,Kensington,367
|
|
||||||
E14000769,Kettering,368
|
|
||||||
E14000770,Kingston and Surbiton,369
|
|
||||||
E14000771,Kingston upon Hull East,370
|
|
||||||
E14000772,Kingston upon Hull North,371
|
|
||||||
E14000773,Kingston upon Hull West and Hessle,372
|
|
||||||
E14000774,Kingswood,373
|
|
||||||
E14000775,Knowsley,374
|
|
||||||
E14000776,Lancaster and Fleetwood,375
|
|
||||||
E14000777,Leeds Central,376
|
|
||||||
E14000778,Leeds East,377
|
|
||||||
E14000779,Leeds North East,378
|
|
||||||
E14000708,Garston and Halewood,379
|
|
||||||
E14000709,Gateshead,380
|
|
||||||
E14000710,Gedling,381
|
|
||||||
E14000711,Gillingham and Rainham,382
|
|
||||||
E14000712,Gloucester,383
|
|
||||||
E14000713,Gosport,384
|
|
||||||
E14000714,Grantham and Stamford,385
|
|
||||||
E14000715,Gravesham,386
|
|
||||||
E14000716,Great Grimsby,387
|
|
||||||
E14000717,Great Yarmouth,388
|
|
||||||
E14000718,Greenwich and Woolwich,389
|
|
||||||
E14000719,Guildford,390
|
|
||||||
E14000720,Hackney North and Stoke Newington,391
|
|
||||||
E14000721,Hackney South and Shoreditch,392
|
|
||||||
E14000722,Halesowen and Rowley Regis,393
|
|
||||||
E14000723,Halifax,394
|
|
||||||
E14000724,Haltemprice and Howden,395
|
|
||||||
E14000725,Halton,396
|
|
||||||
E14000726,Hammersmith,397
|
|
||||||
E14000727,Hampstead and Kilburn,398
|
|
||||||
E14000728,Harborough,399
|
|
||||||
E14000729,Harlow,400
|
|
||||||
E14000930,Slough,401
|
|
||||||
E14000931,Solihull,402
|
|
||||||
E14000932,Somerton and Frome,403
|
|
||||||
E14000933,South Basildon and East Thurrock,404
|
|
||||||
E14000934,South Cambridgeshire,405
|
|
||||||
E14000935,South Derbyshire,406
|
|
||||||
E14000936,South Dorset,407
|
|
||||||
E14000937,South East Cambridgeshire,408
|
|
||||||
E14000938,South East Cornwall,409
|
|
||||||
E14000939,South Holland and The Deepings,410
|
|
||||||
E14000940,South Leicestershire,411
|
|
||||||
E14000941,South Norfolk,412
|
|
||||||
E14000942,South Northamptonshire,413
|
|
||||||
E14000943,South Ribble,414
|
|
||||||
E14000944,South Shields,415
|
|
||||||
E14000945,South Staffordshire,416
|
|
||||||
E14000946,South Suffolk,417
|
|
||||||
E14000947,South Swindon,418
|
|
||||||
E14000948,South Thanet,419
|
|
||||||
E14000949,South West Bedfordshire,420
|
|
||||||
E14000950,South West Devon,421
|
|
||||||
E14000951,South West Hertfordshire,422
|
|
||||||
E14000952,South West Norfolk,423
|
|
||||||
E14000953,South West Surrey,424
|
|
||||||
E14000954,South West Wiltshire,425
|
|
||||||
E14000955,"Southampton, Itchen",426
|
|
||||||
E14000956,"Southampton, Test",427
|
|
||||||
E14000957,Southend West,428
|
|
||||||
E14000958,Southport,429
|
|
||||||
E14000959,Spelthorne,430
|
|
||||||
E14000960,St Albans,431
|
|
||||||
E14000961,St Austell and Newquay,432
|
|
||||||
E14000962,St Helens North,433
|
|
||||||
E14000963,St Helens South and Whiston,434
|
|
||||||
E14000964,St Ives,435
|
|
||||||
E14000965,Stafford,436
|
|
||||||
E14000966,Staffordshire Moorlands,437
|
|
||||||
E14000967,Stalybridge and Hyde,438
|
|
||||||
E14000968,Stevenage,439
|
|
||||||
E14000969,Stockport,440
|
|
||||||
E14000970,Stockton North,441
|
|
||||||
E14000971,Stockton South,442
|
|
||||||
E14000972,Stoke-on-Trent Central,443
|
|
||||||
E14000973,Stoke-on-Trent North,444
|
|
||||||
E14000974,Stoke-on-Trent South,445
|
|
||||||
E14000975,Stone,446
|
|
||||||
E14000976,Stourbridge,447
|
|
||||||
E14000977,Stratford-on-Avon,448
|
|
||||||
E14000978,Streatham,449
|
|
||||||
E14000979,Stretford and Urmston,450
|
|
||||||
E14000980,Stroud,451
|
|
||||||
E14000981,Suffolk Coastal,452
|
|
||||||
E14000982,Sunderland Central,453
|
|
||||||
E14000983,Surrey Heath,454
|
|
||||||
E14000984,Sutton and Cheam,455
|
|
||||||
E14000985,Sutton Coldfield,456
|
|
||||||
E14000986,Tamworth,457
|
|
||||||
E14000987,Tatton,458
|
|
||||||
E14000988,Taunton Deane,459
|
|
||||||
E14000989,Telford,460
|
|
||||||
E14000990,Tewkesbury,461
|
|
||||||
E14000991,The Cotswolds,462
|
|
||||||
E14000992,The Wrekin,463
|
|
||||||
E14000993,Thirsk and Malton,464
|
|
||||||
E14000994,Thornbury and Yate,465
|
|
||||||
E14000995,Thurrock,466
|
|
||||||
E14000996,Tiverton and Honiton,467
|
|
||||||
E14000997,Tonbridge and Malling,468
|
|
||||||
E14000998,Tooting,469
|
|
||||||
E14000999,Torbay,470
|
|
||||||
E14001000,Torridge and West Devon,471
|
|
||||||
E14001001,Totnes,472
|
|
||||||
E14001002,Tottenham,473
|
|
||||||
E14001003,Truro and Falmouth,474
|
|
||||||
E14001004,Tunbridge Wells,475
|
|
||||||
E14001005,Twickenham,476
|
|
||||||
E14001006,Tynemouth,477
|
|
||||||
E14001007,Uxbridge and South Ruislip,478
|
|
||||||
E14001008,Vauxhall,479
|
|
||||||
E14001009,Wakefield,480
|
|
||||||
E14001010,Wallasey,481
|
|
||||||
E14001011,Walsall North,482
|
|
||||||
E14001012,Walsall South,483
|
|
||||||
E14001013,Walthamstow,484
|
|
||||||
E14001014,Wansbeck,485
|
|
||||||
E14001015,Wantage,486
|
|
||||||
E14001016,Warley,487
|
|
||||||
E14001017,Warrington North,488
|
|
||||||
E14001018,Warrington South,489
|
|
||||||
E14001019,Warwick and Leamington,490
|
|
||||||
E14001020,Washington and Sunderland West,491
|
|
||||||
E14001021,Watford,492
|
|
||||||
E14001022,Waveney,493
|
|
||||||
E14001023,Wealden,494
|
|
||||||
E14001024,Weaver Vale,495
|
|
||||||
E14001025,Wellingborough,496
|
|
||||||
E14001026,Wells,497
|
|
||||||
E14001027,Welwyn Hatfield,498
|
|
||||||
E14001028,Wentworth and Dearne,499
|
|
||||||
E14001029,West Bromwich East,500
|
|
||||||
E14001030,West Bromwich West,501
|
|
||||||
E14001031,West Dorset,502
|
|
||||||
E14001032,West Ham,503
|
|
||||||
E14001033,West Lancashire,504
|
|
||||||
E14001034,West Suffolk,505
|
|
||||||
E14001035,West Worcestershire,506
|
|
||||||
E14001036,Westminster North,507
|
|
||||||
E14001037,Westmorland and Lonsdale,508
|
|
||||||
E14001038,Weston-Super-Mare,509
|
|
||||||
E14001039,Wigan,510
|
|
||||||
E14001040,Wimbledon,511
|
|
||||||
E14001041,Winchester,512
|
|
||||||
E14001042,Windsor,513
|
|
||||||
E14001043,Wirral South,514
|
|
||||||
E14001044,Wirral West,515
|
|
||||||
E14001045,Witham,516
|
|
||||||
E14001046,Witney,517
|
|
||||||
E14001047,Woking,518
|
|
||||||
E14001048,Wokingham,519
|
|
||||||
E14001049,Wolverhampton North East,520
|
|
||||||
E14001050,Wolverhampton South East,521
|
|
||||||
E14001051,Wolverhampton South West,522
|
|
||||||
E14001052,Worcester,523
|
|
||||||
E14001053,Workington,524
|
|
||||||
E14001054,Worsley and Eccles South,525
|
|
||||||
E14001055,Worthing West,526
|
|
||||||
E14001056,Wycombe,527
|
|
||||||
E14001057,Wyre and Preston North,528
|
|
||||||
E14001058,Wyre Forest,529
|
|
||||||
E14001059,Wythenshawe and Sale East,530
|
|
||||||
E14001060,Yeovil,531
|
|
||||||
E14001061,York Central,532
|
|
||||||
E14001062,York Outer,533
|
|
||||||
N06000001,Belfast East,534
|
|
||||||
N06000002,Belfast North,535
|
|
||||||
N06000003,Belfast South,536
|
|
||||||
N06000004,Belfast West,537
|
|
||||||
N06000005,East Antrim,538
|
|
||||||
N06000006,East Londonderry,539
|
|
||||||
N06000007,Fermanagh and South Tyrone,540
|
|
||||||
N06000008,Foyle,541
|
|
||||||
N06000009,Lagan Valley,542
|
|
||||||
N06000010,Mid Ulster,543
|
|
||||||
N06000011,Newry and Armagh,544
|
|
||||||
N06000012,North Antrim,545
|
|
||||||
N06000013,North Down,546
|
|
||||||
N06000014,South Antrim,547
|
|
||||||
N06000015,South Down,548
|
|
||||||
N06000016,Strangford,549
|
|
||||||
N06000017,Upper Bann,550
|
|
||||||
S14000050,Ochil and South Perthshire,551
|
|
||||||
S14000051,Orkney and Shetland,552
|
|
||||||
S14000052,Paisley and Renfrewshire North,553
|
|
||||||
S14000053,Paisley and Renfrewshire South,554
|
|
||||||
S14000054,Perth and North Perthshire,555
|
|
||||||
S14000055,"Ross, Skye and Lochaber",556
|
|
||||||
S14000056,Rutherglen and Hamilton West,557
|
|
||||||
S14000057,Stirling,558
|
|
||||||
S14000058,West Aberdeenshire and Kincardine,559
|
|
||||||
S14000059,West Dunbartonshire,560
|
|
||||||
W07000041,Ynys Môn,561
|
|
||||||
W07000042,Delyn,562
|
|
||||||
W07000043,Alyn and Deeside,563
|
|
||||||
W07000044,Wrexham,564
|
|
||||||
W07000045,Llanelli,565
|
|
||||||
W07000046,Gower,566
|
|
||||||
W07000047,Swansea West,567
|
|
||||||
W07000048,Swansea East,568
|
|
||||||
W07000049,Aberavon,569
|
|
||||||
W07000050,Cardiff Central,570
|
|
||||||
W07000051,Cardiff North,571
|
|
||||||
W07000052,Rhondda,572
|
|
||||||
W07000053,Torfaen,573
|
|
||||||
W07000054,Monmouth,574
|
|
||||||
W07000055,Newport East,575
|
|
||||||
W07000056,Newport West,576
|
|
||||||
W07000057,Arfon,577
|
|
||||||
W07000058,Aberconwy,578
|
|
||||||
W07000059,Clwyd West,579
|
|
||||||
W07000060,Vale of Clwyd,580
|
|
||||||
W07000061,Dwyfor Meirionnydd,581
|
|
||||||
W07000062,Clwyd South,582
|
|
||||||
W07000063,Montgomeryshire,583
|
|
||||||
W07000064,Ceredigion,584
|
|
||||||
W07000065,Preseli Pembrokeshire,585
|
|
||||||
W07000066,Carmarthen West and South Pembrokeshire,586
|
|
||||||
W07000067,Carmarthen East and Dinefwr,587
|
|
||||||
W07000068,Brecon and Radnorshire,588
|
|
||||||
W07000069,Neath,589
|
|
||||||
W07000070,Cynon Valley,590
|
|
||||||
W07000071,Merthyr Tydfil and Rhymney,591
|
|
||||||
W07000072,Blaenau Gwent,592
|
|
||||||
W07000073,Bridgend,593
|
|
||||||
W07000074,Ogmore,594
|
|
||||||
W07000075,Pontypridd,595
|
|
||||||
W07000076,Caerphilly,596
|
|
||||||
W07000077,Islwyn,597
|
|
||||||
W07000078,Vale of Glamorgan,598
|
|
||||||
W07000079,Cardiff West,599
|
|
||||||
W07000080,Cardiff South and Penarth,600
|
|
||||||
N06000018,West Tyrone,601
|
|
||||||
S14000001,Aberdeen North,602
|
|
||||||
S14000002,Aberdeen South,603
|
|
||||||
S14000003,Airdrie and Shotts,604
|
|
||||||
S14000004,Angus,605
|
|
||||||
S14000005,Argyll and Bute,606
|
|
||||||
S14000006,"Ayr, Carrick and Cumnock",607
|
|
||||||
S14000007,Banff and Buchan,608
|
|
||||||
S14000008,"Berwickshire, Roxburgh and Selkirk",609
|
|
||||||
S14000009,"Caithness, Sutherland and Easter Ross",610
|
|
||||||
S14000010,Central Ayrshire,611
|
|
||||||
S14000011,"Coatbridge, Chryston and Bellshill",612
|
|
||||||
S14000012,"Cumbernauld, Kilsyth and Kirkintilloch East",613
|
|
||||||
S14000013,Dumfries and Galloway,614
|
|
||||||
S14000014,"Dumfriesshire, Clydesdale and Tweeddale",615
|
|
||||||
S14000015,Dundee East,616
|
|
||||||
S14000016,Dundee West,617
|
|
||||||
S14000017,Dunfermline and West Fife,618
|
|
||||||
S14000018,East Dunbartonshire,619
|
|
||||||
S14000019,"East Kilbride, Strathaven and Lesmahagow",620
|
|
||||||
S14000020,East Lothian,621
|
|
||||||
S14000021,East Renfrewshire,622
|
|
||||||
S14000022,Edinburgh East,623
|
|
||||||
S14000023,Edinburgh North and Leith,624
|
|
||||||
S14000024,Edinburgh South,625
|
|
||||||
S14000025,Edinburgh South West,626
|
|
||||||
S14000026,Edinburgh West,627
|
|
||||||
S14000027,Na h-Eileanan an Iar,628
|
|
||||||
S14000028,Falkirk,629
|
|
||||||
S14000029,Glasgow Central,630
|
|
||||||
S14000030,Glasgow East,631
|
|
||||||
S14000031,Glasgow North,632
|
|
||||||
S14000032,Glasgow North East,633
|
|
||||||
S14000033,Glasgow North West,634
|
|
||||||
S14000034,Glasgow South,635
|
|
||||||
S14000035,Glasgow South West,636
|
|
||||||
S14000036,Glenrothes,637
|
|
||||||
S14000037,Gordon,638
|
|
||||||
S14000038,Inverclyde,639
|
|
||||||
S14000039,"Inverness, Nairn, Badenoch and Strathspey",640
|
|
||||||
S14000040,Kilmarnock and Loudoun,641
|
|
||||||
S14000041,Kirkcaldy and Cowdenbeath,642
|
|
||||||
S14000042,Lanark and Hamilton East,643
|
|
||||||
S14000043,Linlithgow and East Falkirk,644
|
|
||||||
S14000044,Livingston,645
|
|
||||||
S14000045,Midlothian,646
|
|
||||||
S14000046,Moray,647
|
|
||||||
S14000047,Motherwell and Wishaw,648
|
|
||||||
S14000048,North Ayrshire and Arran,649
|
|
||||||
S14000049,North East Fife,650
|
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
requests
|
|
||||||
python-dotenv
|
|
||||||
pandas
|
|
||||||
tqdm
|
|
||||||
|
|
@ -1,26 +1,61 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from model_data.BaseUtility import Definitions
|
from BaseUtility import Definitions
|
||||||
from model_data.simulation_system.core.Settings import (
|
from etl.epc.settings import (
|
||||||
DATA_PROCESSOR_SETTINGS,
|
DATA_PROCESSOR_SETTINGS,
|
||||||
EARLIEST_EPC_DATE,
|
EARLIEST_EPC_DATE,
|
||||||
FULLY_GLAZED_DESCRIPTIONS,
|
FULLY_GLAZED_DESCRIPTIONS,
|
||||||
AVERAGE_FIXED_FEATURES,
|
AVERAGE_FIXED_FEATURES,
|
||||||
FLOOR_LEVEL_MAP,
|
|
||||||
BUILT_FORM_REMAP,
|
BUILT_FORM_REMAP,
|
||||||
COLUMNS_TO_MERGE_ON,
|
COLUMNS_TO_MERGE_ON,
|
||||||
COMPONENT_FEATURES,
|
|
||||||
FIXED_FEATURES,
|
FIXED_FEATURES,
|
||||||
COLUMNTYPES,
|
COLUMNTYPES,
|
||||||
RDSAP_RESPONSE,
|
RDSAP_RESPONSE,
|
||||||
MAX_SAP_SCORE,
|
MAX_SAP_SCORE,
|
||||||
fill_na_map,
|
fill_na_map,
|
||||||
FIXED_DESCRIPTON_MAPPED_FEATURES
|
STARTING_SUFFIX_COMPONENT_COLS,
|
||||||
|
NO_SUFFIX_COMPONENT_COLS,
|
||||||
|
ENDING_SUFFIX_COMPONENT_COLS
|
||||||
)
|
)
|
||||||
|
from recommendations.rdsap_tables import FLOOR_LEVEL_MAP
|
||||||
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
# These lookups are used to clean the construction age band.

# Inclusive lower ("l") and upper ("u") year bounds for each standard age band label.
bounds_map = {
    "England and Wales: before 1900": {"l": 0, "u": 1899},
    "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
    "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
    "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
    "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
    "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
    "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
    "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
    "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
    "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
    "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
    "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
}

# Non-standard labels mapped onto the standard label that replaces them.
remap = {
    "England and Wales: 2007 onwards": "England and Wales: 2007-2011"
}

# Year (0 to 3000 inclusive) -> label of the band whose bounds contain that year.
# The bands do not overlap, so the first match is the only match; next() stops at
# it instead of building a list of matches just to index element 0.
expanded_map = {
    year: next(
        label for label, bounds in bounds_map.items() if bounds["l"] <= year <= bounds["u"]
    )
    for year in range(0, 3001)
}
|
||||||
|
|
||||||
|
|
||||||
|
def is_int(x):
    """
    Report whether ``int(x)`` succeeds for the given value.

    :param x: any value, e.g. a string, a number or None
    :return: True when ``int(x)`` does not raise, False otherwise
    """
    try:
        int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or containers; ValueError: non-numeric text or NaN;
        # OverflowError: +/- infinity. Anything else (e.g. KeyboardInterrupt)
        # is deliberately left to propagate rather than being swallowed.
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
class DataProcessor:
|
class DataProcessor:
|
||||||
"""
|
"""
|
||||||
|
|
@ -46,66 +81,36 @@ class DataProcessor:
|
||||||
def insert_data(self, data: pd.DataFrame) -> None:
|
def insert_data(self, data: pd.DataFrame) -> None:
|
||||||
self.data = data
|
self.data = data
|
||||||
|
|
||||||
|
@staticmethod
def clean_construction_age_band(x):
    """
    Map a single raw construction age band value onto a standard band label.

    :param x: raw value from the CONSTRUCTION_AGE_BAND column
    :return: x unchanged when it is an anomaly marker, missing, or already a
        standard label; otherwise the standard label it remaps to
    :raises KeyError: when x converts to an integer outside the 0-3000 range
        covered by expanded_map
    :raises NotImplementedError: when x matches none of the handled cases
    """
    # Firstly, we check if it's an error value or a missing value
    if x in Definitions.DATA_ANOMALY_MATCHES or x is None:
        return x

    # NaN never compares equal to itself, so a membership test only catches the
    # np.nan singleton; test explicitly so every NaN is passed through
    if isinstance(x, (float, np.floating)) and np.isnan(x):
        return x

    # Next, we check if it's already a standard label
    if x in bounds_map:
        return x

    # We check if it's a known non-standard label
    remap_value = remap.get(x, None)
    if remap_value:
        return remap_value

    # We check if it's a number, in which case it is treated as a year
    if is_int(x):
        return expanded_map[int(x)]

    raise NotImplementedError("Not handled the case for value %s" % x)
|
||||||
|
|
||||||
def standardise_construction_age_band(self):
|
def standardise_construction_age_band(self):
|
||||||
"""
|
"""
|
||||||
This function will tidy up some of the non-standard values that are populated in the construction age
|
This function will tidy up some of the non-standard values that are populated in the construction age
|
||||||
band, which is useful for cleaning
|
band, which is useful for cleaning
|
||||||
"""
|
"""
|
||||||
bounds_map = {
|
|
||||||
"England and Wales: before 1900": {"l": 0, "u": 1899},
|
|
||||||
"England and Wales: 1930-1949": {"l": 1930, "u": 1949},
|
|
||||||
"England and Wales: 1900-1929": {"l": 1900, "u": 1929},
|
|
||||||
"England and Wales: 1950-1966": {"l": 1950, "u": 1966},
|
|
||||||
"England and Wales: 1967-1975": {"l": 1967, "u": 1975},
|
|
||||||
"England and Wales: 1976-1982": {"l": 1976, "u": 1982},
|
|
||||||
"England and Wales: 1983-1990": {"l": 1983, "u": 1990},
|
|
||||||
"England and Wales: 1991-1995": {"l": 1991, "u": 1995},
|
|
||||||
"England and Wales: 1996-2002": {"l": 1996, "u": 2002},
|
|
||||||
"England and Wales: 2003-2006": {"l": 2003, "u": 2006},
|
|
||||||
"England and Wales: 2007-2011": {"l": 2007, "u": 2011},
|
|
||||||
"England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
|
|
||||||
}
|
|
||||||
|
|
||||||
remap = {
|
|
||||||
"England and Wales: 2007 onwards": "England and Wales: 2007-2011"
|
|
||||||
}
|
|
||||||
|
|
||||||
expanded_map = {
|
|
||||||
i: [
|
|
||||||
label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
|
|
||||||
][0] for i in range(0, 3001)
|
|
||||||
}
|
|
||||||
|
|
||||||
def is_int(x):
|
|
||||||
try:
|
|
||||||
int(x)
|
|
||||||
return True
|
|
||||||
except:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def clean_construction_age_band(x):
|
|
||||||
# Firstly, we check if it's an error value
|
|
||||||
if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]:
|
|
||||||
return x
|
|
||||||
|
|
||||||
# Next, we check if it's a value in our map
|
|
||||||
if bounds_map.get(x):
|
|
||||||
return x
|
|
||||||
|
|
||||||
# We check if it's a standard remap value
|
|
||||||
remap_value = remap.get(x, None)
|
|
||||||
if remap_value:
|
|
||||||
return remap_value
|
|
||||||
|
|
||||||
# We check if it's a number
|
|
||||||
if is_int(x):
|
|
||||||
x_int = int(x)
|
|
||||||
return expanded_map[x_int]
|
|
||||||
|
|
||||||
raise NotImplementedError("Not handled the case for value %s" % x)
|
|
||||||
|
|
||||||
self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
|
self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
|
||||||
lambda x: clean_construction_age_band(x)
|
lambda x: self.clean_construction_age_band(x)
|
||||||
)
|
)
|
||||||
|
|
||||||
self.data = self.data[
|
self.data = self.data[
|
||||||
|
|
@ -157,18 +162,6 @@ class DataProcessor:
|
||||||
break
|
break
|
||||||
to_index -= 1
|
to_index -= 1
|
||||||
|
|
||||||
def reformat_columns(self):
|
|
||||||
"""
|
|
||||||
This function applies the re-formatting of columns from lower case to capitalised
|
|
||||||
|
|
||||||
When requesting the epc data from the api, the columns are lower case
|
|
||||||
and separated by a hyphen, whereas in the bulk download, the columns
|
|
||||||
are capitalised and separated by underscores. If rename_columns is True
|
|
||||||
we convert the columns from lower case to capitalised format
|
|
||||||
:return:
|
|
||||||
"""
|
|
||||||
self.data.columns = [col.upper().replace("-", "_") for col in self.data.columns]
|
|
||||||
|
|
||||||
def pre_process(self) -> pd.DataFrame:
|
def pre_process(self) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Load data and begin initial cleaning
|
Load data and begin initial cleaning
|
||||||
|
|
@ -176,22 +169,24 @@ class DataProcessor:
|
||||||
if self.data is None:
|
if self.data is None:
|
||||||
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
|
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
|
||||||
|
|
||||||
if self.newdata:
|
|
||||||
self.reformat_columns()
|
|
||||||
|
|
||||||
if not self.newdata:
|
if not self.newdata:
|
||||||
self.confine_data()
|
self.confine_data()
|
||||||
|
|
||||||
self.remap_columns()
|
self.remap_columns()
|
||||||
|
|
||||||
# We have some non-standard construction age bands which we'll clean for matching
|
# We have some non-standard construction age bands which we'll clean for matching
|
||||||
self.standardise_construction_age_band()
|
if not self.newdata:
|
||||||
self.clean_missing_rooms()
|
self.standardise_construction_age_band()
|
||||||
|
|
||||||
|
self.clean_missing_rooms()
|
||||||
|
|
||||||
self.recast_df_columns(
|
self.recast_df_columns(
|
||||||
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
|
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
|
||||||
)
|
)
|
||||||
self.clean_multi_glaze_proportion()
|
|
||||||
|
if not self.newdata:
|
||||||
|
self.clean_multi_glaze_proportion()
|
||||||
|
|
||||||
self.clean_photo_supply()
|
self.clean_photo_supply()
|
||||||
|
|
||||||
if not self.newdata:
|
if not self.newdata:
|
||||||
|
|
@ -203,16 +198,24 @@ class DataProcessor:
|
||||||
# If we have multiple EPC records, we can try and do filling
|
# If we have multiple EPC records, we can try and do filling
|
||||||
self.fill_na_fields()
|
self.fill_na_fields()
|
||||||
|
|
||||||
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
if not self.newdata:
|
||||||
|
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||||
|
|
||||||
# Final re-casting after data transformed and prepared
|
# Final re-casting after data transformed and prepared
|
||||||
self.data = self.data.astype(COLUMNTYPES)
|
coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.newdata else COLUMNTYPES
|
||||||
|
self.data = self.data.astype(coltypes)
|
||||||
|
|
||||||
self.na_remapping()
|
self.na_remapping()
|
||||||
|
|
||||||
return self.data
|
return self.data
|
||||||
|
|
||||||
def na_remapping(self):
|
def na_remapping(self):
|
||||||
for column, fill_value in fill_na_map.items():
|
|
||||||
|
fill_na_map_apply = {
|
||||||
|
k: v for k, v in fill_na_map.items() if k in self.data.columns
|
||||||
|
} if self.newdata else fill_na_map
|
||||||
|
|
||||||
|
for column, fill_value in fill_na_map_apply.items():
|
||||||
self.data[column] = self.data[column].fillna(fill_value)
|
self.data[column] = self.data[column].fillna(fill_value)
|
||||||
|
|
||||||
def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
|
def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
|
||||||
|
|
@ -255,7 +258,8 @@ class DataProcessor:
|
||||||
data = data.replace(np.NAN, None)
|
data = data.replace(np.NAN, None)
|
||||||
|
|
||||||
# Remap certain columns
|
# Remap certain columns
|
||||||
data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
|
if not self.newdata:
|
||||||
|
data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
|
||||||
data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
|
data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
|
||||||
|
|
||||||
convert_to_lower = ["TRANSACTION_TYPE"]
|
convert_to_lower = ["TRANSACTION_TYPE"]
|
||||||
|
|
@ -348,7 +352,7 @@ class DataProcessor:
|
||||||
|
|
||||||
cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")
|
cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")
|
||||||
|
|
||||||
# If there are still NA values, use the average across all properties in the constituency
|
# If there are still NA values, use the average across all epc in the constituency
|
||||||
cleaning_averages_filled[variable] = cleaning_averages_filled[
|
cleaning_averages_filled[variable] = cleaning_averages_filled[
|
||||||
variable
|
variable
|
||||||
].fillna(cleaning_averages_filled[variable].mean())
|
].fillna(cleaning_averages_filled[variable].mean())
|
||||||
|
|
@ -497,9 +501,15 @@ class DataProcessor:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if suffix not in ["_STARTING", "_ENDING"]:
|
if suffix not in ["_STARTING", "_ENDING"]:
|
||||||
raise Exception("Suffix should be one of _STARTING or _ENFING")
|
raise Exception("Suffix should be one of _STARTING or _ENDING")
|
||||||
|
|
||||||
return self.data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix(suffix)
|
if suffix == "_STARTING":
|
||||||
|
starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix)
|
||||||
|
fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS].copy()
|
||||||
|
|
||||||
|
return pd.concat([starting_cols, fixed_cols], axis=1)
|
||||||
|
|
||||||
|
return self.data[ENDING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix)
|
||||||
|
|
||||||
def get_fixed_features(self) -> pd.DataFrame:
|
def get_fixed_features(self) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
|
|
@ -529,125 +539,33 @@ class DataProcessor:
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
@classmethod
|
@staticmethod
|
||||||
def difference_data(cls, df: pd.DataFrame):
|
def calculate_days_to(lodgement_date):
|
||||||
|
|
||||||
"""
|
if isinstance(lodgement_date, str):
|
||||||
Given a dataframe and starting and ending columns, this function will convert the features to
|
return (
|
||||||
differences (the ending value minus the starting value), which is useful for modelling the difference responses
|
pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
|
||||||
"""
|
).days
|
||||||
|
|
||||||
# We ensure that the u value columns are co-erced to a numerical format
|
return (
|
||||||
uvalue_columns = [col for col in df.columns if "thermal_transmittance" in col]
|
pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
|
||||||
for uvalue_col in uvalue_columns:
|
).dt.days
|
||||||
df[uvalue_col] = pd.to_numeric(df[uvalue_col])
|
|
||||||
|
|
||||||
key_columns = [
|
@staticmethod
|
||||||
"RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE",
|
def clean_missings_after_description_process(df, ignore_cols=None):
|
||||||
"SAP_STARTING", "HEAT_DEMAND_STARTING",
|
missings = pd.isnull(df).sum()
|
||||||
"CARBON_STARTING", "UPRN", "CONSTITUENCY",
|
missings = missings[missings > 0]
|
||||||
"SAP_ENDING", "CARBON_ENDING", "HEAT_DEMAND_ENDING",
|
|
||||||
"DAYS_TO_STARTING", "DAYS_TO_ENDING"
|
|
||||||
]
|
|
||||||
|
|
||||||
ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns
|
if ignore_cols:
|
||||||
|
missings = missings[~missings.index.isin(ignore_cols)]
|
||||||
|
|
||||||
columns = {x for x in df.columns if x not in ignore_cols}
|
for col in missings.index:
|
||||||
|
unique_values = df[col].unique()
|
||||||
non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
|
if True in unique_values or False in unique_values:
|
||||||
non_numerical_columns = [col for col in non_numerical_columns if col in columns]
|
df[col] = df[col].fillna(False)
|
||||||
levels = {col: df[col].unique().tolist() for col in non_numerical_columns}
|
if "none" in unique_values:
|
||||||
|
df[col] = df[col].fillna("none")
|
||||||
df = pd.get_dummies(df, columns=non_numerical_columns)
|
|
||||||
|
|
||||||
# We make sure there is a starting and ending version of the column
|
|
||||||
diff_columns = []
|
|
||||||
no_diff_columns = [] # Store for debugging
|
|
||||||
for col in columns:
|
|
||||||
if "_ENDING" in col:
|
|
||||||
# Don't keep the endings
|
|
||||||
continue
|
|
||||||
else:
|
else:
|
||||||
# We have a starting column so check if we have an ending
|
df[col] = df[col].fillna("Unknown")
|
||||||
if col.replace("_STARTING", "") + "_ENDING" in columns:
|
|
||||||
diff_columns.append(col)
|
|
||||||
else:
|
|
||||||
no_diff_columns.append(col)
|
|
||||||
|
|
||||||
if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
|
|
||||||
raise Exception("Something went wrong, potentially missed a differencing column")
|
|
||||||
|
|
||||||
datatypes = df.dtypes
|
|
||||||
|
|
||||||
# Note: We also difference columns like floor area and floor height. We should experiment with this.
|
|
||||||
# Starting floor area will heavily impact the starting sap value so that feature may be encapsulated by
|
|
||||||
# the starting value, therefore to explain any differences in the new floor area, it may be enough to
|
|
||||||
# just consider the difference however we can play around with this.
|
|
||||||
|
|
||||||
# Do the differencing
|
|
||||||
cols_to_append = {}
|
|
||||||
for starting_col in diff_columns:
|
|
||||||
|
|
||||||
base_col = starting_col.replace("_STARTING", "")
|
|
||||||
|
|
||||||
if "_STARTING" in starting_col:
|
|
||||||
ending_col = starting_col.replace("_STARTING", "_ENDING")
|
|
||||||
else:
|
|
||||||
ending_col = starting_col + "_ENDING"
|
|
||||||
|
|
||||||
if starting_col not in non_numerical_columns:
|
|
||||||
cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
|
|
||||||
df = df.drop(columns=[starting_col, ending_col])
|
|
||||||
continue
|
|
||||||
|
|
||||||
level_values = list(set(levels[starting_col] + levels[ending_col]))
|
|
||||||
|
|
||||||
level_cols = []
|
|
||||||
for level in level_values:
|
|
||||||
starting_level_col = "_".join([starting_col, str(level)])
|
|
||||||
ending_level_col = "_".join([ending_col, str(level)])
|
|
||||||
|
|
||||||
if starting_level_col not in df.columns:
|
|
||||||
# We have no starting, just ending
|
|
||||||
col_type = datatypes[ending_level_col].name
|
|
||||||
|
|
||||||
if col_type == "bool":
|
|
||||||
cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col].astype(int)
|
|
||||||
else:
|
|
||||||
cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col]
|
|
||||||
|
|
||||||
level_cols.append(ending_level_col)
|
|
||||||
|
|
||||||
elif ending_level_col not in df.columns:
|
|
||||||
# We have no ending, just starting
|
|
||||||
col_type = datatypes[starting_level_col].name
|
|
||||||
|
|
||||||
if col_type == "bool":
|
|
||||||
cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[starting_level_col].astype(int)
|
|
||||||
else:
|
|
||||||
cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[ending_level_col]
|
|
||||||
|
|
||||||
level_cols.append(starting_level_col)
|
|
||||||
|
|
||||||
else:
|
|
||||||
col_type = datatypes[starting_level_col].name
|
|
||||||
|
|
||||||
if col_type == "bool":
|
|
||||||
cols_to_append[f"{base_col}_{level}_DIFF"] = (
|
|
||||||
df[ending_level_col].astype(int) - df[starting_level_col].astype(int)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col] - df[starting_level_col]
|
|
||||||
|
|
||||||
level_cols.extend([starting_level_col, ending_level_col])
|
|
||||||
|
|
||||||
# Drop the columns
|
|
||||||
df = df.drop(columns=level_cols)
|
|
||||||
|
|
||||||
cols_to_append = pd.DataFrame(cols_to_append)
|
|
||||||
df = pd.concat([df, cols_to_append], axis=1)
|
|
||||||
|
|
||||||
# Perform a final coercing of string True/False columns to boolean
|
|
||||||
df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns)
|
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
@ -4,25 +4,24 @@ from tqdm import tqdm
|
||||||
import msgpack
|
import msgpack
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from model_data.simulation_system.core.Settings import (
|
from etl.epc.settings import (
|
||||||
MANDATORY_FIXED_FEATURES,
|
MANDATORY_FIXED_FEATURES,
|
||||||
LATEST_FIELD,
|
LATEST_FIELD,
|
||||||
COMPONENT_FEATURES,
|
COMPONENT_FEATURES,
|
||||||
RDSAP_RESPONSE,
|
RDSAP_RESPONSE,
|
||||||
HEAT_DEMAND_RESPONSE,
|
HEAT_DEMAND_RESPONSE,
|
||||||
COLUMNS_TO_MERGE_ON,
|
COLUMNS_TO_MERGE_ON,
|
||||||
EARLIEST_EPC_DATE,
|
|
||||||
CARBON_RESPONSE,
|
CARBON_RESPONSE,
|
||||||
)
|
)
|
||||||
from model_data.simulation_system.core.DataProcessor import DataProcessor
|
from etl.epc.DataProcessor import DataProcessor
|
||||||
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_from_s3_parquet
|
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
|
||||||
from recommendations.rdsap_tables import england_wales_age_band_lookup
|
from recommendations.rdsap_tables import england_wales_age_band_lookup
|
||||||
from recommendations.recommendation_utils import (
|
from recommendations.recommendation_utils import (
|
||||||
get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter,
|
get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter,
|
||||||
get_wall_type
|
get_wall_type
|
||||||
)
|
)
|
||||||
|
|
||||||
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
|
||||||
|
|
||||||
|
|
||||||
def get_cleaned():
|
def get_cleaned():
|
||||||
|
|
@ -364,21 +363,6 @@ def make_uvalues(df):
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
def clean_missings_after_description_process(df):
|
|
||||||
missings = pd.isnull(df).sum()
|
|
||||||
missings = missings[missings > 0]
|
|
||||||
for col in missings.index:
|
|
||||||
unique_values = df[col].unique()
|
|
||||||
if True in unique_values or False in unique_values:
|
|
||||||
df[col] = df[col].fillna(False)
|
|
||||||
if "none" in unique_values:
|
|
||||||
df[col] = df[col].fillna("none")
|
|
||||||
else:
|
|
||||||
df[col] = df[col].fillna("Unknown")
|
|
||||||
|
|
||||||
return df
|
|
||||||
|
|
||||||
|
|
||||||
def app():
|
def app():
|
||||||
# Get all the files in the directory
|
# Get all the files in the directory
|
||||||
|
|
||||||
|
|
@ -400,6 +384,8 @@ def app():
|
||||||
data_processor = DataProcessor(filepath=filepath)
|
data_processor = DataProcessor(filepath=filepath)
|
||||||
|
|
||||||
df = data_processor.pre_process()
|
df = data_processor.pre_process()
|
||||||
|
df[df["WALLS_DESCRIPTION"].str.contains("Cavity")]["WALLS_DESCRIPTION"].unique()
|
||||||
|
|
||||||
cleaning_averages = data_processor.make_cleaning_averages()
|
cleaning_averages = data_processor.make_cleaning_averages()
|
||||||
|
|
||||||
# We have some odd cases with missing constituency so we fill
|
# We have some odd cases with missing constituency so we fill
|
||||||
|
|
@ -512,12 +498,11 @@ def app():
|
||||||
|
|
||||||
# Add some temporal features - we look at the days from the standard starting point in time
|
# Add some temporal features - we look at the days from the standard starting point in time
|
||||||
# for the starting and ending date so all records are from a fixed point
|
# for the starting and ending date so all records are from a fixed point
|
||||||
data_by_urpn_df["DAYS_TO_STARTING"] = (
|
data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
|
||||||
pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_STARTING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
|
data_by_urpn_df["LODGEMENT_DATE_STARTING"])
|
||||||
).dt.days
|
|
||||||
data_by_urpn_df["DAYS_TO_ENDING"] = (
|
data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
|
||||||
pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
|
data_by_urpn_df["LODGEMENT_DATE_ENDING"])
|
||||||
).dt.days
|
|
||||||
|
|
||||||
data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
|
data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
|
||||||
|
|
||||||
|
|
@ -544,7 +529,7 @@ def app():
|
||||||
# Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
|
# Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
|
||||||
# need to
|
# need to
|
||||||
|
|
||||||
data_by_urpn_df = clean_missings_after_description_process(data_by_urpn_df)
|
data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df)
|
||||||
|
|
||||||
if pd.isnull(data_by_urpn_df).sum().sum():
|
if pd.isnull(data_by_urpn_df).sum().sum():
|
||||||
raise ValueError("Null values found in dataset after process_and_prune_desriptions")
|
raise ValueError("Null values found in dataset after process_and_prune_desriptions")
|
||||||
|
|
@ -564,6 +549,12 @@ def app():
|
||||||
|
|
||||||
output = pd.concat(dataset)
|
output = pd.concat(dataset)
|
||||||
|
|
||||||
|
# Remove any records that have huge swings in their floor area
|
||||||
|
output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
|
||||||
|
output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]
|
||||||
|
output = output[output["tfa_diff_prop"] < 0.5]
|
||||||
|
output = output.drop(columns=["tfa_diff_abs", "tfa_diff_prop"])
|
||||||
|
|
||||||
uvalue_columns = [col for col in output.columns if "thermal_transmittance" in col]
|
uvalue_columns = [col for col in output.columns if "thermal_transmittance" in col]
|
||||||
for uvalue_col in uvalue_columns:
|
for uvalue_col in uvalue_columns:
|
||||||
output[uvalue_col] = pd.to_numeric(output[uvalue_col])
|
output[uvalue_col] = pd.to_numeric(output[uvalue_col])
|
||||||
|
|
@ -571,15 +562,7 @@ def app():
|
||||||
save_dataframe_to_s3_parquet(
|
save_dataframe_to_s3_parquet(
|
||||||
df=output,
|
df=output,
|
||||||
bucket_name="retrofit-data-dev",
|
bucket_name="retrofit-data-dev",
|
||||||
file_key="sap_change_model/dataset_without_differencing.parquet",
|
file_key="sap_change_model/dataset.parquet",
|
||||||
)
|
|
||||||
|
|
||||||
output = DataProcessor.difference_data(output)
|
|
||||||
|
|
||||||
save_dataframe_to_s3_parquet(
|
|
||||||
df=output,
|
|
||||||
bucket_name="retrofit-data-dev",
|
|
||||||
file_key="sap_change_model/dataset_with_differencing.parquet",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -133,28 +133,6 @@ RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
|
||||||
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
|
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
|
||||||
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
|
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
|
||||||
|
|
||||||
|
|
||||||
def ordinal(n):
|
|
||||||
if 10 <= n % 100 <= 20:
|
|
||||||
suffix = "th"
|
|
||||||
else:
|
|
||||||
suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
|
|
||||||
|
|
||||||
return str(n) + suffix
|
|
||||||
|
|
||||||
|
|
||||||
FLOOR_LEVEL_MAP = {
|
|
||||||
"Basement": -1,
|
|
||||||
"Ground": 0,
|
|
||||||
"ground floor": 0,
|
|
||||||
"20+": 20,
|
|
||||||
"21st or above": 21,
|
|
||||||
**{str(i).zfill(2): i for i in range(0, 21)},
|
|
||||||
**{ordinal(i): i for i in range(-1, 21)},
|
|
||||||
**{str(i): i for i in range(-1, 21)},
|
|
||||||
**{i: i for i in range(-1, 21)},
|
|
||||||
}
|
|
||||||
|
|
||||||
BUILT_FORM_REMAP = {
|
BUILT_FORM_REMAP = {
|
||||||
"Enclosed End-Terrace": "End-Terrace",
|
"Enclosed End-Terrace": "End-Terrace",
|
||||||
"Enclosed Mid-Terrace": "Mid-Terrace",
|
"Enclosed Mid-Terrace": "Mid-Terrace",
|
||||||
|
|
@ -212,10 +190,66 @@ fill_na_map = {
|
||||||
"NUMBER_OPEN_FIREPLACES": 0
|
"NUMBER_OPEN_FIREPLACES": 0
|
||||||
}
|
}
|
||||||
|
|
||||||
# After the property descriptions have been re-remapped, we expect these features to be fixed
|
################################################################################################
|
||||||
FIXED_DESCRIPTON_MAPPED_FEATURES = [
|
# These are the features we need for scoring
|
||||||
'another_property_below', 'is_roof_room', 'is_granite_or_whinstone', 'is_flat', 'is_suspended',
|
# We'll likely change how we do this in the future
|
||||||
'has_dwelling_above', 'is_as_built', 'is_to_external_air', 'is_cob', 'is_pitched', 'is_solid', 'is_at_rafters',
|
################################################################################################
|
||||||
'is_solid_brick', 'is_loft', 'is_system_built', 'is_timber_frame', 'is_sandstone_or_limestone', 'is_filled_cavity',
|
|
||||||
'is_cavity_wall', 'is_thatched', 'is_to_unheated_space'
|
STARTING_SUFFIX_COMPONENT_COLS = [
|
||||||
|
"SAP", "HEAT_DEMAND", "CARBON", "TRANSACTION_TYPE", "MECHANICAL_VENTILATION",
|
||||||
|
"SECONDHEAT_DESCRIPTION", "ENERGY_TARIFF", "SOLAR_WATER_HEATING_FLAG", "PHOTO_SUPPLY",
|
||||||
|
"GLAZED_TYPE", "MULTI_GLAZE_PROPORTION", "LOW_ENERGY_LIGHTING", "NUMBER_OPEN_FIREPLACES",
|
||||||
|
"EXTENSION_COUNT", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "DAYS_TO", "estimated_perimeter"
|
||||||
|
]
|
||||||
|
NO_SUFFIX_COMPONENT_COLS = ['walls_thermal_transmittance', 'is_cavity_wall',
|
||||||
|
'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
|
||||||
|
'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone',
|
||||||
|
'is_park_home', 'walls_insulation_thickness', 'external_insulation', 'internal_insulation',
|
||||||
|
'floor_thermal_transmittance', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended',
|
||||||
|
'is_solid', 'another_property_below', 'floor_insulation_thickness',
|
||||||
|
'roof_thermal_transmittance', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat',
|
||||||
|
'is_thatched', 'is_at_rafters', 'has_dwelling_above', 'roof_insulation_thickness',
|
||||||
|
'heater_type', 'system_type', 'thermostat_characteristics', 'heating_scope',
|
||||||
|
'energy_recovery',
|
||||||
|
'hotwater_tariff_type', 'extra_features', 'chp_systems', 'distribution_system',
|
||||||
|
'no_system_present', 'appliance', 'has_radiators', 'has_fan_coil_units',
|
||||||
|
'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
|
||||||
|
'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
|
||||||
|
'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
|
||||||
|
'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
|
||||||
|
'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
|
||||||
|
'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
|
||||||
|
'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas',
|
||||||
|
'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
|
||||||
|
'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k',
|
||||||
|
'has_electricaire', 'has_assumed_for_most_rooms', 'has_underfloor_heating',
|
||||||
|
'thermostatic_control', 'charging_system', 'switch_system', 'no_control', 'dhw_control',
|
||||||
|
'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
|
||||||
|
'rate_control',
|
||||||
|
'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
|
||||||
|
'no_individual_heating_or_community_network', 'complex_fuel_type',
|
||||||
|
]
|
||||||
|
|
||||||
|
ENDING_SUFFIX_COMPONENT_COLS = [
|
||||||
|
'SAP', 'HEAT_DEMAND', 'CARBON', 'TRANSACTION_TYPE', 'MECHANICAL_VENTILATION', 'SECONDHEAT_DESCRIPTION',
|
||||||
|
'ENERGY_TARIFF', 'SOLAR_WATER_HEATING_FLAG', 'PHOTO_SUPPLY', 'GLAZED_TYPE', 'MULTI_GLAZE_PROPORTION',
|
||||||
|
'LOW_ENERGY_LIGHTING', 'NUMBER_OPEN_FIREPLACES', 'EXTENSION_COUNT', 'TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT',
|
||||||
|
'DAYS_TO', 'walls_thermal_transmittance', 'is_park_home', 'walls_insulation_thickness',
|
||||||
|
'external_insulation', 'internal_insulation', 'floor_thermal_transmittance', 'floor_insulation_thickness',
|
||||||
|
'roof_thermal_transmittance', 'roof_insulation_thickness', 'heater_type', 'system_type',
|
||||||
|
'thermostat_characteristics', 'heating_scope', 'energy_recovery', 'hotwater_tariff_type', 'extra_features',
|
||||||
|
'chp_systems', 'distribution_system', 'no_system_present', 'appliance', 'has_radiators',
|
||||||
|
'has_fan_coil_units', 'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
|
||||||
|
'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
|
||||||
|
'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
|
||||||
|
'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
|
||||||
|
'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
|
||||||
|
'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
|
||||||
|
'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas', 'has_wood_logs',
|
||||||
|
'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', 'has_dual_fuel_mineral_and_wood',
|
||||||
|
'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire', 'has_assumed_for_most_rooms',
|
||||||
|
'has_underfloor_heating', 'thermostatic_control', 'charging_system', 'switch_system', 'no_control',
|
||||||
|
'dhw_control', 'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
|
||||||
|
'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
|
||||||
|
'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter'
|
||||||
]
|
]
|
||||||
|
|
@ -4,16 +4,16 @@ from collections import defaultdict
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from model_data.utils import correct_spelling
|
from etl.epc_clean.utils import correct_spelling
|
||||||
from model_data.epc_attributes.FloorAttributes import FloorAttributes
|
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
||||||
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||||
from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||||
from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
|
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||||
from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||||
from model_data.epc_attributes.RoofAttributes import RoofAttributes
|
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||||
from model_data.epc_attributes.WallAttributes import WallAttributes
|
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
||||||
from model_data.epc_attributes.WindowAttributes import WindowAttributes
|
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
|
||||||
from model_data.epc_attributes.LightingAttributes import LightingAttributes
|
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
|
||||||
|
|
||||||
|
|
||||||
class EpcClean:
|
class EpcClean:
|
||||||
|
|
@ -130,7 +130,7 @@ class EpcClean:
|
||||||
self.cleaned[field].append(
|
self.cleaned[field].append(
|
||||||
{
|
{
|
||||||
"original_description": description,
|
"original_description": description,
|
||||||
"clean_description": cln.description.capitalize(),
|
"clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
|
||||||
**cln.process()
|
**cln.process()
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
@ -3,8 +3,8 @@ import os
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import msgpack
|
import msgpack
|
||||||
|
|
||||||
from model_data.EpcClean import EpcClean
|
from etl.epc_clean.EpcClean import EpcClean
|
||||||
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
|
from etl.epc.settings import EARLIEST_EPC_DATE
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from utils.s3 import save_data_to_s3
|
from utils.s3 import save_data_to_s3
|
||||||
|
|
||||||
|
|
@ -19,7 +19,7 @@ LAND_REGISTRY_PATHS = [
|
||||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
|
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
|
||||||
]
|
]
|
||||||
|
|
||||||
EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
EPC_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
|
||||||
|
|
||||||
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
|
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
|
||||||
|
|
||||||
|
|
@ -27,7 +27,7 @@ ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
|
||||||
def app():
|
def app():
|
||||||
"""
|
"""
|
||||||
For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
|
For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
|
||||||
and produce a dataset of cleaned fields so that when we get new properties, we can quickly
|
and produce a dataset of cleaned fields so that when we get new epc, we can quickly
|
||||||
sanitise any description data
|
sanitise any description data
|
||||||
|
|
||||||
Currently, this application is just run on a local machine
|
Currently, this application is just run on a local machine
|
||||||
|
|
@ -36,9 +36,6 @@ def app():
|
||||||
cleaned_data = {}
|
cleaned_data = {}
|
||||||
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
|
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
|
||||||
for directory in tqdm(epc_directories):
|
for directory in tqdm(epc_directories):
|
||||||
directory_destructured = str(directory).split("/")[-1].split("-")
|
|
||||||
gss_code = directory_destructured[1]
|
|
||||||
local_authority = directory_destructured[2]
|
|
||||||
|
|
||||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||||
# Rename the columns to the same format as the api returns
|
# Rename the columns to the same format as the api returns
|
||||||
|
|
@ -62,14 +59,6 @@ def app():
|
||||||
new_data = [x for x in data if x["original_description"] not in existing_descriptions]
|
new_data = [x for x in data if x["original_description"] not in existing_descriptions]
|
||||||
cleaned_data[k].extend(new_data)
|
cleaned_data[k].extend(new_data)
|
||||||
|
|
||||||
# TODO: Add property age band into this
|
|
||||||
# uvalue_estimates = UvalueEstimations(data=data)
|
|
||||||
# uvalue_estimates.get_estimates(cleaner=cleaner)
|
|
||||||
# # TODO: Store these to a s3
|
|
||||||
# uvalue_estimates.walls
|
|
||||||
# uvalue_estimates.floors
|
|
||||||
# uvalue_estimates.roofs
|
|
||||||
|
|
||||||
# Basic check to make sure all descriptions are unique
|
# Basic check to make sure all descriptions are unique
|
||||||
for _, cleaned in cleaned_data.items():
|
for _, cleaned in cleaned_data.items():
|
||||||
descriptions = [x["original_description"] for x in cleaned]
|
descriptions = [x["original_description"] for x in cleaned]
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import re
|
import re
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
from model_data.BaseUtility import Definitions
|
from BaseUtility import Definitions
|
||||||
from model_data.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types
|
from etl.epc_clean.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types
|
||||||
|
|
||||||
|
|
||||||
class FloorAttributes(Definitions):
|
class FloorAttributes(Definitions):
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
from model_data.BaseUtility import Definitions
|
from BaseUtility import Definitions
|
||||||
from model_data.epc_attributes.attribute_utils import clean_description, find_keyword
|
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, find_keyword
|
||||||
|
|
||||||
|
|
||||||
class HotWaterAttributes(Definitions):
|
class HotWaterAttributes(Definitions):
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import re
|
import re
|
||||||
from model_data.epc_attributes.attribute_utils import clean_description
|
from etl.epc_clean.epc_attributes.attribute_utils import clean_description
|
||||||
from model_data.utils import correct_spelling
|
from etl.epc_clean.utils import correct_spelling
|
||||||
|
|
||||||
|
|
||||||
class LightingAttributes:
|
class LightingAttributes:
|
||||||
|
|
@ -27,7 +27,7 @@ class LightingAttributes:
|
||||||
lel_match2 = re.search(r"goleuadau ynni-isel mewn (\d+)%? o'r mannau gosod", self.description)
|
lel_match2 = re.search(r"goleuadau ynni-isel mewn (\d+)%? o'r mannau gosod", self.description)
|
||||||
|
|
||||||
if lel_match is not None or lel_match2 is not None:
|
if lel_match is not None or lel_match2 is not None:
|
||||||
|
|
||||||
# Perform the actual translation
|
# Perform the actual translation
|
||||||
percentage = lel_match.group(1) if lel_match is not None else lel_match2.group(1)
|
percentage = lel_match.group(1) if lel_match is not None else lel_match2.group(1)
|
||||||
self.description = f"low energy lighting in {percentage}% of fixed outlets"
|
self.description = f"low energy lighting in {percentage}% of fixed outlets"
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
from model_data.BaseUtility import Definitions
|
from BaseUtility import Definitions
|
||||||
from model_data.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
|
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
|
||||||
|
|
||||||
|
|
||||||
class MainFuelAttributes(Definitions):
|
class MainFuelAttributes(Definitions):
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
from model_data.BaseUtility import Definitions
|
from BaseUtility import Definitions
|
||||||
from model_data.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
|
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
from model_data.BaseUtility import Definitions
|
from BaseUtility import Definitions
|
||||||
from model_data.epc_attributes.attribute_utils import clean_description, find_keyword
|
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, find_keyword
|
||||||
|
|
||||||
|
|
||||||
class MainheatControlAttributes(Definitions):
|
class MainheatControlAttributes(Definitions):
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import re
|
import re
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
from model_data.BaseUtility import Definitions
|
from BaseUtility import Definitions
|
||||||
from model_data.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
|
from etl.epc_clean.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
|
||||||
|
|
||||||
|
|
||||||
class RoofAttributes(Definitions):
|
class RoofAttributes(Definitions):
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import re
|
import re
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
from model_data.BaseUtility import Definitions
|
from BaseUtility import Definitions
|
||||||
from model_data.epc_attributes.attribute_utils import (
|
from etl.epc_clean.epc_attributes.attribute_utils import (
|
||||||
extract_component_types,
|
extract_component_types,
|
||||||
extract_thermal_transmittance
|
extract_thermal_transmittance
|
||||||
)
|
)
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from typing import Dict, Union
|
from typing import Dict, Union
|
||||||
from model_data.BaseUtility import Definitions
|
from BaseUtility import Definitions
|
||||||
from model_data.epc_attributes.attribute_utils import clean_description
|
from etl.epc_clean.epc_attributes.attribute_utils import clean_description
|
||||||
|
|
||||||
|
|
||||||
class WindowAttributes(Definitions):
|
class WindowAttributes(Definitions):
|
||||||
21
etl/epc_clean/epc_attributes/all_cleaners.py
Normal file
21
etl/epc_clean/epc_attributes/all_cleaners.py
Normal file
|
|
@ -0,0 +1,21 @@
|
||||||
|
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
||||||
|
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||||
|
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||||
|
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||||
|
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||||
|
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||||
|
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
||||||
|
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
|
||||||
|
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
|
||||||
|
|
||||||
|
# Lookup from a description field name to the attribute-cleaner class that
# handles it. NOTE(review): the keys look like the hyphenated field names the
# EPC API returns - confirm against the callers of this map.
all_cleaner_map = {
    'floor-description': FloorAttributes,
    'hotwater-description': HotWaterAttributes,
    'main-fuel': MainFuelAttributes,
    'mainheat-description': MainHeatAttributes,
    'mainheatcont-description': MainheatControlAttributes,
    'roof-description': RoofAttributes,
    'walls-description': WallAttributes,
    'windows-description': WindowAttributes,
    # This key used to be 'lighting-description:' (stray trailing colon), which
    # did not follow the pattern of the other keys and so could never be found
    # when looking the cleaner up by field name.
    'lighting-description': LightingAttributes,
}
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
import pytest
|
import pytest
|
||||||
import model_data.epc_attributes.attribute_utils as attribute_utils
|
import etl.epc_clean.epc_attributes.attribute_utils as attribute_utils
|
||||||
|
|
||||||
|
|
||||||
def test_extract_thermal_transmittance():
|
def test_extract_thermal_transmittance():
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
import pickle
|
import pickle
|
||||||
from model_data.EpcClean import EpcClean
|
from etl.epc_clean.EpcClean import EpcClean
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
# For local testing
|
# For local testing
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from model_data.tests.test_data.test_floor_attributes_cases import clean_floor_cases
|
from etl.epc_clean.tests.test_data.test_floor_attributes_cases import clean_floor_cases
|
||||||
from model_data.epc_attributes.FloorAttributes import FloorAttributes
|
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
||||||
|
|
||||||
|
|
||||||
class TestCleanFloor:
|
class TestCleanFloor:
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||||
from model_data.tests.test_data.test_hot_water_attributes_cases import hotwater_cases
|
from etl.epc_clean.tests.test_data.test_hot_water_attributes_cases import hotwater_cases
|
||||||
|
|
||||||
|
|
||||||
class TestHotWaterAttributes:
|
class TestHotWaterAttributes:
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pytest
|
import pytest
|
||||||
from model_data.tests.test_data.test_lighting_attributes_cases import test_cases
|
from etl.epc_clean.tests.test_data.test_lighting_attributes_cases import test_cases
|
||||||
from model_data.epc_attributes.LightingAttributes import LightingAttributes
|
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
|
||||||
|
|
||||||
# An example averages dataset to use in tests. It is a dictionary where the key is a lighting description and the
|
# An example averages dataset to use in tests. It is a dictionary where the key is a lighting description and the
|
||||||
# value is the expected proportion.
|
# value is the expected proportion.
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||||
from model_data.tests.test_data.test_main_fuel_attributes_cases import mainfuel_cases
|
from etl.epc_clean.tests.test_data.test_main_fuel_attributes_cases import mainfuel_cases
|
||||||
|
|
||||||
|
|
||||||
class TestMainHeatControlAttributes:
|
class TestMainHeatControlAttributes:
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
|
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||||
from model_data.tests.test_data.test_mainheat_attributes_cases import mainheat_cases
|
from etl.epc_clean.tests.test_data.test_mainheat_attributes_cases import mainheat_cases
|
||||||
|
|
||||||
|
|
||||||
class TestMainHeatAttributes:
|
class TestMainHeatAttributes:
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||||
from model_data.tests.test_data.test_mainheat_control_attributes_cases import mainheat_control_cases
|
from etl.epc_clean.tests.test_data.test_mainheat_control_attributes_cases import mainheat_control_cases
|
||||||
|
|
||||||
|
|
||||||
class TestMainHeatControlAttributes:
|
class TestMainHeatControlAttributes:
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import pytest
|
import pytest
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from model_data.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases
|
from etl.epc_clean.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases
|
||||||
from model_data.epc_attributes.RoofAttributes import RoofAttributes
|
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||||
|
|
||||||
# For local testing
|
# For local testing
|
||||||
if __file__ == "<input>":
|
if __file__ == "<input>":
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
from model_data.utils import is_percentage_or_number, correct_spelling
|
from etl.epc_clean.utils import is_percentage_or_number, correct_spelling
|
||||||
|
|
||||||
|
|
||||||
class TestUtils:
|
class TestUtils:
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from model_data.epc_attributes.WallAttributes import WallAttributes
|
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
||||||
from model_data.tests.test_data.test_wall_attributes_cases import wall_cases
|
from etl.epc_clean.tests.test_data.test_wall_attributes_cases import wall_cases
|
||||||
|
|
||||||
|
|
||||||
class TestWallAttributes:
|
class TestWallAttributes:
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import pytest
|
import pytest
|
||||||
from model_data.epc_attributes.WindowAttributes import WindowAttributes
|
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
|
||||||
from model_data.tests.test_data.test_window_attributes_cases import windows_cases
|
from etl.epc_clean.tests.test_data.test_window_attributes_cases import windows_cases
|
||||||
|
|
||||||
|
|
||||||
class TestWindowAttributes:
|
class TestWindowAttributes:
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from unittest.mock import patch, call
|
from unittest.mock import patch, call
|
||||||
from model_data.LandRegistryClient import LandRegistryClient
|
from etl.land_registry.LandRegistryClient import LandRegistryClient
|
||||||
|
|
||||||
|
|
||||||
class TestLandRegistryClient:
|
class TestLandRegistryClient:
|
||||||
54
etl/property_dimensions/app.py
Normal file
54
etl/property_dimensions/app.py
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
"""
|
||||||
|
This is a simple application which estimates some of the basic dimensions of a property based on EPC
|
||||||
|
data which we can use as a proxy value if we don't have this information on the EPC
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
import pandas as pd
|
||||||
|
from tqdm import tqdm
|
||||||
|
from etl.epc.settings import EARLIEST_EPC_DATE
|
||||||
|
from etl.epc.DataProcessor import DataProcessor
|
||||||
|
from BaseUtility import Definitions
|
||||||
|
from utils.s3 import save_dataframe_to_s3_parquet
|
||||||
|
|
||||||
|
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
|
||||||
|
|
||||||
|
GROUPBY = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY", "CONSTRUCTION_AGE_BAND"]
|
||||||
|
|
||||||
|
BUCKET = os.environ.get("BUCKET", "retrofit-data-dev")
|
||||||
|
|
||||||
|
|
||||||
|
def app():
    """
    Estimate typical property dimensions from the bulk EPC download and store them in S3.

    For every sub-directory of DATA_DIRECTORY the ``certificates.csv`` file is read, rows that are
    too old, have no UPRN, have an unusable construction age band or are missing any of the
    measured dimensions are dropped, and the remainder is aggregated per GROUPBY combination
    (median habitable rooms, mean total floor area, mean floor height). The result is written to
    ``property_dimensions/{local_authority}.parquet`` in BUCKET.

    Directories that have no usable rows left after filtering are skipped with a message instead
    of failing with an IndexError.

    :raises Exception: if a single directory contains data for more than one local authority
    """
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    for directory in tqdm(directories):
        data = pd.read_csv(directory / "certificates.csv", low_memory=False)

        # Keep recent certificates that are tied to a UPRN. Take a copy so the column
        # assignments below operate on an independent frame rather than on a slice
        # of the raw data (which triggers pandas' chained-assignment warning)
        keep = (data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE) & ~pd.isnull(data["UPRN"])
        data = data[keep].copy()

        data["TOTAL_FLOOR_AREA"] = data["TOTAL_FLOOR_AREA"].astype(float)
        data["CONSTRUCTION_AGE_BAND"] = data["CONSTRUCTION_AGE_BAND"].apply(
            DataProcessor.clean_construction_age_band
        )

        # Every field we group by or aggregate must be present
        data = data.dropna(
            subset=["CONSTRUCTION_AGE_BAND", "TOTAL_FLOOR_AREA", "NUMBER_HABITABLE_ROOMS", "FLOOR_HEIGHT"]
        )
        data = data[~data["CONSTRUCTION_AGE_BAND"].isin(Definitions.DATA_ANOMALY_MATCHES)]

        if data.empty:
            # Nothing to aggregate and no local authority to name the output file after
            tqdm.write(f"Skipping {directory.name}: no usable certificates after filtering")
            continue

        # The output file is named after the local authority, so there must be exactly one
        local_authority = data["LOCAL_AUTHORITY"].unique()
        if len(local_authority) > 1:
            raise Exception("More than one la in data")
        local_authority = local_authority[0]

        df = (
            data.groupby(GROUPBY)
            .agg({"NUMBER_HABITABLE_ROOMS": "median", "TOTAL_FLOOR_AREA": "mean", "FLOOR_HEIGHT": "mean"})
            .reset_index()
        )

        save_dataframe_to_s3_parquet(
            df=df,
            bucket_name=BUCKET,
            file_key=f"property_dimensions/{local_authority}.parquet",
        )
|
||||||
|
|
@ -56,7 +56,7 @@ class BoreholeClient:
|
||||||
|
|
||||||
# EXAMPLE
|
# EXAMPLE
|
||||||
# There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of
|
# There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of
|
||||||
# entries in here if possible before we produce any form of comparison between our properties, to infer
|
# entries in here if possible before we produce any form of comparison between our properties, to infer
|
||||||
# the distance from the property to the nearest borehole
|
# the distance from the property to the nearest borehole
|
||||||
|
|
||||||
# Let's take a sample
|
# Let's take a sample
|
||||||
|
|
@ -1,12 +1,55 @@
|
||||||
from enum import Enum
|
import boto3
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
import geopandas as gpd
|
import geopandas as gpd
|
||||||
|
import numpy as np
|
||||||
|
from enum import Enum
|
||||||
from shapely.geometry import Point
|
from shapely.geometry import Point
|
||||||
from utils.logger import setup_logger
|
from utils.logger import setup_logger
|
||||||
|
from utils.s3 import read_io_from_s3
|
||||||
from datatypes.datatypes import OpenUprnCoordinateData
|
from datatypes.datatypes import OpenUprnCoordinateData
|
||||||
|
|
||||||
logger = setup_logger()
|
logger = setup_logger()
|
||||||
|
|
||||||
|
|
||||||
|
def read_shapefile_from_s3(bucket_name, s3_file_key):
    """
    Read a shapefile from S3 into a GeoDataFrame.

    Every object stored in the same S3 "folder" as ``s3_file_key`` is downloaded into a temporary
    directory (so that any companion files sitting next to the shapefile are available locally),
    and the file named by ``s3_file_key`` is then opened from that directory.

    :param bucket_name: The name of the S3 bucket
    :param s3_file_key: The file path of the shape file
    :return: GeoDataFrame containing the shapefile data
    :raises KeyError: if nothing is stored under the folder prefix (the listing response then has
        no 'Contents' entry)
    """

    s3_folder_key, _, shape_file_key = s3_file_key.rpartition("/")
    # Terminate the prefix with "/" so that a sibling folder which merely starts with the same
    # characters (e.g. "shapes2/" when we want "shapes/") is not downloaded as well
    folder_prefix = f"{s3_folder_key}/" if s3_folder_key else ""

    # The directory is created by TemporaryDirectory and removed again when the block exits
    with tempfile.TemporaryDirectory() as tmpdirname:
        s3_client = boto3.client('s3')
        logger.info("Creating temporary directory at %s", tmpdirname)

        # List all files in the given S3 folder
        # NOTE: a single list_objects call returns at most 1000 keys
        s3_objects = s3_client.list_objects(Bucket=bucket_name, Prefix=folder_prefix)['Contents']

        # Download each file to the temporary directory
        for s3_object in s3_objects:
            file_key = s3_object['Key']
            file_name = os.path.basename(file_key)
            if not file_name:
                # Zero-byte "folder" placeholder keys end in "/" and have no file name;
                # trying to open them locally would target the temporary directory itself
                continue

            local_file_path = os.path.join(tmpdirname, file_name)
            with open(local_file_path, 'wb') as tmpfile:
                s3_client.download_fileobj(bucket_name, file_key, tmpfile)

        # Read the shapefile from the temporary directory into a GeoDataFrame. This has to happen
        # while the temporary directory still exists
        shapefile_path = os.path.join(tmpdirname, shape_file_key)
        gdf = gpd.read_file(shapefile_path)

    return gdf
|
||||||
|
|
||||||
|
|
||||||
class ConservationAreaClient:
|
class ConservationAreaClient:
|
||||||
"""
|
"""
|
||||||
Class to interact with and manipulate conservation area data. The historic england data
|
Class to interact with and manipulate conservation area data. The historic england data
|
||||||
|
|
@ -18,13 +61,14 @@ class ConservationAreaClient:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
SOURCES = ["historic_england"]
|
SOURCES = ["historic_england"]
|
||||||
IN_CONSERVATION_AREA = "in_conservation_area"
|
IN_CONSERVATION_AREA = True
|
||||||
NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
|
NOT_IN_CONSERVATION_AREA = False
|
||||||
UNKNOWN = "unknown"
|
UNKNOWN = None
|
||||||
|
|
||||||
def __init__(self, historic_england_path, gov_path):
|
def __init__(self, historic_england_path, gov_path, bucket):
|
||||||
self.historic_england_path = historic_england_path
|
self.historic_england_path = historic_england_path
|
||||||
self.gov_path = gov_path
|
self.gov_path = gov_path
|
||||||
|
self.bucket = bucket
|
||||||
|
|
||||||
self.historic_england_data = None
|
self.historic_england_data = None
|
||||||
self.gov_data = None
|
self.gov_data = None
|
||||||
|
|
@ -34,11 +78,21 @@ class ConservationAreaClient:
|
||||||
Read the data
|
Read the data
|
||||||
"""
|
"""
|
||||||
logger.info("Reading in historic england conservation area shapefile")
|
logger.info("Reading in historic england conservation area shapefile")
|
||||||
self.historic_england_data = gpd.read_file(self.historic_england_path)
|
self.historic_england_data = read_shapefile_from_s3(
|
||||||
|
bucket_name=self.bucket, s3_file_key=self.historic_england_path
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("Reading in Govenment conservation area geojson")
|
logger.info("Reading in Govenment conservation area geojson")
|
||||||
self.gov_data = gpd.read_file(self.gov_path)
|
|
||||||
|
self.gov_data = gpd.read_file(
|
||||||
|
read_io_from_s3(
|
||||||
|
bucket_name=self.bucket,
|
||||||
|
file_key=self.gov_path
|
||||||
|
)
|
||||||
|
)
|
||||||
self.gov_data = self.gov_data.drop(columns=["dataset"])
|
self.gov_data = self.gov_data.drop(columns=["dataset"])
|
||||||
|
# Convert the gov data to british national grid co-ordinates
|
||||||
|
self.gov_data = self.gov_data.to_crs("EPSG:27700")
|
||||||
|
|
||||||
def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
|
def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
|
||||||
|
|
||||||
|
|
@ -71,6 +125,43 @@ class ConservationAreaClient:
|
||||||
else:
|
else:
|
||||||
return ConservationAreaClient.UNKNOWN
|
return ConservationAreaClient.UNKNOWN
|
||||||
|
|
||||||
|
def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Classify every row of ``uprn_gdf`` against the conservation area datasets in one pass.

    A ``conservation_status`` column is written onto ``uprn_gdf`` itself (the input is modified
    and also returned). Statuses are applied in this order, later ones overwriting earlier ones:

    1. every row starts as UNKNOWN
    2. IN_CONSERVATION_AREA if the point lies within a Historic England polygon whose NAME is
       not the "No data available for publication by HE" placeholder
    3. NOT_IN_CONSERVATION_AREA if the point matched no Historic England polygon at all
    4. IN_CONSERVATION_AREA if the point lies within a government polygon; this check is run
       for every UPRN that step 2 did not flag

    :param uprn_gdf: GeoDataFrame with a "UPRN" column and point geometry
    :return: the same GeoDataFrame with the ``conservation_status`` column set
    """
    status_column = 'conservation_status'
    he_placeholder_name = "No data available for publication by HE"

    # A left join keeps every UPRN; index_right is missing where no polygon contains the point
    he_join = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within")
    he_unmatched = he_join.index_right.isna()

    he_confirmed_rows = ~he_unmatched & (he_join["NAME"] != he_placeholder_name)
    he_inside = he_join[he_confirmed_rows]["UPRN"].unique()
    he_outside = he_join.loc[~he_join["UPRN"].isin(he_inside) & he_unmatched, "UPRN"].unique()

    # Everything Historic England did not confirm is looked up in the government data
    remaining_gdf = uprn_gdf[~uprn_gdf["UPRN"].isin(he_inside)]
    gov_join = gpd.sjoin(remaining_gdf, self.gov_data, how="left", predicate="within")
    gov_inside = gov_join.loc[~gov_join.index_right.isna(), "UPRN"].unique()

    uprn_gdf[status_column] = self.UNKNOWN
    assignments = (
        (he_inside, self.IN_CONSERVATION_AREA),
        (he_outside, self.NOT_IN_CONSERVATION_AREA),
        (gov_inside, self.IN_CONSERVATION_AREA),
    )
    for uprns, status in assignments:
        uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprns), status_column] = status

    return uprn_gdf
|
||||||
|
|
||||||
def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> str:
|
def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> str:
|
||||||
"""
|
"""
|
||||||
Check if a property is in a conservation area
|
Check if a property is in a conservation area
|
||||||
118
etl/spatial/OpenUprnClient.py
Normal file
118
etl/spatial/OpenUprnClient.py
Normal file
|
|
@ -0,0 +1,118 @@
|
||||||
|
import os
|
||||||
|
from tqdm import tqdm
|
||||||
|
import pandas as pd
|
||||||
|
import geopandas as gpd
|
||||||
|
from utils.logger import setup_logger
|
||||||
|
from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet
|
||||||
|
|
||||||
|
logger = setup_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class OpenUprnClient:
    """
    This client reads in the Open UPRN data from s3 which can be downloaded from here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    This dataset contains a lookup of UPRNs to coordinates.

    Specs for this dataset can be found here:
    https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
    """

    def __init__(self, path, bucket, uprns=None):
        """
        :param path: S3 key (for ``read``) or local path (for ``read_local``) of the Open UPRN csv
        :param bucket: S3 bucket that ``read`` loads the csv from
        :param uprns: optional iterable of UPRNs (anything ``int()`` accepts) to restrict the data to
        """
        self.path = path
        self.bucket = bucket
        self.uprns = [int(x) for x in uprns] if uprns else None
        self.data = None

        # Mapping of partition number -> parquet filename, built by create_file_partitions.
        # It is used to determine which file a UPRN's data is contained in
        self.filenames = None

    def _restrict_to_requested_uprns(self, df):
        """Return ``df`` limited to ``self.uprns``, or unchanged when no UPRNs were requested."""
        if self.uprns:
            df = df[df["UPRN"].isin(self.uprns)]
        return df

    def read(self):
        """
        Load the Open UPRN csv from S3 into ``self.data``, restricted to ``self.uprns`` if set.
        """
        logger.info("Reading in open uprn data")

        df = pd.read_csv(
            read_io_from_s3(
                bucket_name=self.bucket,
                file_key=self.path
            )
        )
        self.data = self._restrict_to_requested_uprns(df)

    def read_local(self):
        """
        For local testing: load the Open UPRN csv from the local path ``self.path`` into
        ``self.data``, restricted to ``self.uprns`` if set.
        """
        logger.info("Reading in open uprn data")

        df = pd.read_csv(self.path)
        self.data = self._restrict_to_requested_uprns(df)

    def create_file_partitions(self, partition_size=50000):
        """
        Split ``self.data`` into consecutive blocks of ``partition_size`` rows ordered by UPRN.

        Adds a ``partition`` and a ``filename`` column to ``self.data`` and fills
        ``self.filenames`` with one ``{min_uprn}_{max_uprn}.parquet`` name per partition.

        :param partition_size: number of rows per partition
        """
        logger.info("Sorting data by UPRN ascending")
        self.data = self.data.sort_values("UPRN", ascending=True)

        logger.info("Creating partitions")
        # Partition on the row's position in the sorted order. The index labels still refer to
        # the order of the csv (and have gaps after filtering), so using them would produce
        # partitions whose UPRN ranges overlap and break the filename lookup
        row_positions = pd.RangeIndex(len(self.data))
        self.data['partition'] = (row_positions // partition_size).to_numpy()

        self.filenames = {}
        for partition, group in tqdm(self.data.groupby('partition')):
            min_uprn = group['UPRN'].min()
            max_uprn = group['UPRN'].max()
            self.filenames[partition] = f"{min_uprn}_{max_uprn}.parquet"

        self.data['filename'] = self.data['partition'].map(self.filenames)

    @staticmethod
    def find_filename_for_uprn(uprn, filenames):
        """
        Find the partition file whose UPRN range contains ``uprn``.

        :param uprn: the UPRN to look up
        :param filenames: iterable of ``{min_uprn}_{max_uprn}.parquet`` names, or a mapping whose
            values are such names (the shape of ``self.filenames``)
        :return: the matching filename, or None if no range contains the UPRN
        """
        candidates = filenames.values() if isinstance(filenames, dict) else filenames
        for filename in candidates:
            min_uprn, max_uprn = map(int, filename.replace(".parquet", "").split("_"))
            if min_uprn <= uprn <= max_uprn:
                return filename
        return None

    @staticmethod
    def convert_bng_data_to_gpd(df):
        """
        Build a GeoDataFrame of points from the X_COORDINATE / Y_COORDINATE columns of ``df``.

        :param df: dataframe with X_COORDINATE and Y_COORDINATE columns
        :return: GeoDataFrame with point geometry in EPSG:27700
        """
        gpd_data = gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
            crs="EPSG:27700"  # British National Grid
        )

        return gpd_data

    def save_filenames_to_s3(self, bucket_name):
        """
        Save the partition filenames, together with the lower and upper UPRN of each, to s3

        :param bucket_name: bucket to write ``spatial/filename_meta.parquet`` to
        :return:
        """
        # S3 keys always use "/" regardless of the operating system this runs on
        file_key = "spatial/filename_meta.parquet"

        filenames = pd.DataFrame({"filenames": list(self.filenames.values())})
        # The pattern picks the two numbers straight out of "{lower}_{upper}.parquet", so the
        # extension does not need stripping first
        filenames[['lower', 'upper']] = filenames['filenames'].str.extract(r'(\d+)_(\d+)')
        filenames['lower'] = filenames['lower'].astype(int)
        filenames['upper'] = filenames['upper'].astype(int)

        logger.info("Saving filenames to s3 at {}".format(file_key))
        save_dataframe_to_s3_parquet(
            df=filenames,
            file_key=file_key,
            bucket_name=bucket_name
        )
|
||||||
48
etl/spatial/README.md
Normal file
48
etl/spatial/README.md
Normal file
|
|
@ -0,0 +1,48 @@
|
||||||
|
# Spatial - Geospatial Data Processing Service
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The Spatial service is designed to read, process, and analyze geospatial data related to
|
||||||
|
conservation areas and special buildings. It uses datasets from Historic England and the
|
||||||
|
UK government to determine whether a given UPRN (Unique Property Reference Number) is within
|
||||||
|
a conservation area or is a listed building. The processed data is saved back to an S3 bucket
|
||||||
|
in a parquet format for easy retrieval and further analysis.
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
Dependencies are listed in requirements.txt. To install them, run:
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## Data Sources
|
||||||
|
|
||||||
|
1. **Historic England Conservation Areas**: Shapefile containing polygons of conservation areas.
|
||||||
|
2. **UK Government Conservation Areas**: GeoJSON file containing polygons of conservation areas.
|
||||||
|
3. **Open UPRN Data**: CSV file with UPRN and corresponding geospatial data.
|
||||||
|
4. **Historic England Listed Buildings**: Shapefile with information on listed buildings.
|
||||||
|
5. **Historic England Heritage Buildings at Risk**: Shapefile with information on heritage buildings at risk.
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
- app.py: Main application file that orchestrates the data processing flow.
|
||||||
|
- ConservationAreaClient.py: Handles reading and processing of conservation area data.
|
||||||
|
- OpenUprnClient.py: Manages reading and partitioning of Open UPRN data.
|
||||||
|
- SpecialBuildingsClient.py: Takes care of reading and processing data related to special buildings.
|
||||||
|
- requirements.txt: Lists all Python package dependencies.
|
||||||
|
|
||||||
|
## How to Run
|
||||||
|
|
||||||
|
1. Make sure you have all the required packages installed.
|
||||||
|
2. Update the S3 bucket and file path constants in app.py.
|
||||||
|
3. Run app.py.
|
||||||
|
|
||||||
|
## Workflow
|
||||||
|
|
||||||
|
1. Read the datasets for conservation areas and special buildings.
|
||||||
|
2. Read the Open UPRN dataset and partition it into smaller chunks based on UPRN.
|
||||||
|
3. For each partition:
|
||||||
|
- Convert UPRN data to geopandas DataFrame.
|
||||||
|
- Check if each UPRN is within a conservation area or is a special building.
|
||||||
|
- Save the processed data back to S3 in parquet format.
|
||||||
114
etl/spatial/SpecialBuildingsClient.py
Normal file
114
etl/spatial/SpecialBuildingsClient.py
Normal file
|
|
@ -0,0 +1,114 @@
|
||||||
|
import geopandas as gpd
|
||||||
|
from shapely.geometry import Point
|
||||||
|
from utils.logger import setup_logger
|
||||||
|
from etl.spatial.ConservationAreaClient import read_shapefile_from_s3
|
||||||
|
from datatypes.datatypes import OpenUprnCoordinateData
|
||||||
|
|
||||||
|
logger = setup_logger()
|
||||||
|
|
||||||
|
|
||||||
|
class SpecialBuildingsClient:
    """
    This class reads in data from Historic England, which can be used to determine if specific buildings are
    listed or heritage buildings.

    ``read`` must be called before any of the ``is_*`` methods; until then both datasets are ``None``.
    """

    def __init__(self, historic_england_listed_buildings_path, historic_england_heritage_buildings_path, bucket):
        """
        :param historic_england_listed_buildings_path: S3 key of the listed buildings shapefile
        :param historic_england_heritage_buildings_path: S3 key of the heritage buildings at risk shapefile
        :param bucket: Name of the S3 bucket that both shapefiles are read from
        """
        self.historic_england_listed_buildings_path = historic_england_listed_buildings_path
        self.historic_england_heritage_buildings_path = historic_england_heritage_buildings_path
        self.bucket = bucket

        # Populated by read()
        self.historic_england_listed_buildings = None
        self.historic_england_heritage_buildings = None

    def read(self):
        """
        Read both shapefiles from S3 and store them on the instance.
        """
        logger.info("Reading in historic england listed buildings shapefile")
        self.historic_england_listed_buildings = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_listed_buildings_path
        )

        logger.info("Reading in historic england heritage buildings shapefile")
        self.historic_england_heritage_buildings = read_shapefile_from_s3(
            bucket_name=self.bucket, s3_file_key=self.historic_england_heritage_buildings_path
        )

        # Convert the heritage data to British National Grid co-ordinates (EPSG:27700).
        # NOTE(review): the listed buildings data is not reprojected here - presumably it is already
        # published in EPSG:27700; confirm against the source shapefile.
        self.historic_england_heritage_buildings = self.historic_england_heritage_buildings.to_crs("EPSG:27700")

    @staticmethod
    def _point_within(buildings, coordinates):
        """Return a boolean mask marking the geometries in ``buildings`` that contain the given location."""
        point = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
        return buildings.contains(point)

    @staticmethod
    def _flag_uprns_within(uprn_gdf, buildings, flag_column):
        """Add boolean ``flag_column`` to ``uprn_gdf``: True where the UPRN geometry is within ``buildings``."""
        joined = gpd.sjoin(uprn_gdf, buildings, how="left", predicate="within")

        # Rows without a match on the right-hand side carry a null index_right
        matched_uprns = joined.loc[joined["index_right"].notna(), "UPRN"].unique()

        # The input GeoDataFrame is updated in place and also returned
        uprn_gdf[flag_column] = uprn_gdf["UPRN"].isin(matched_uprns)
        return uprn_gdf

    def is_listed_building(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a listed building.

        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE attributes
            (the OpenUprnCoordinateData format)
        :return: True if the location is within a listed building polygon, False otherwise
        """
        within_listed_buildings = self._point_within(self.historic_england_listed_buildings, coordinates)

        if not within_listed_buildings.any():
            return False

        # Log the names of every listed building polygon containing the location
        names = self.historic_england_listed_buildings.loc[within_listed_buildings, "Name"]
        logger.info(f"The location is within the following listed buildings: {names.values}")
        return True

    def is_listed_building_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Flag every UPRN that falls within a Historic England listed building polygon.

        :param uprn_gdf: GeoDataFrame with a "UPRN" column; it is modified in place
        :return: The same GeoDataFrame with a boolean "is_listed_building" column
        """
        return self._flag_uprns_within(uprn_gdf, self.historic_england_listed_buildings, 'is_listed_building')

    def is_heritage_building_at_risk(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a heritage building at risk.

        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE attributes
            (the OpenUprnCoordinateData format)
        :return: True if the location is within a heritage building at risk polygon, False otherwise
        """
        within_heritage_buildings_at_risk = self._point_within(
            self.historic_england_heritage_buildings, coordinates
        )

        if not within_heritage_buildings_at_risk.any():
            return False

        # Log the names of every heritage building at risk polygon containing the location
        names = self.historic_england_heritage_buildings.loc[within_heritage_buildings_at_risk, "EntryName"]
        logger.info(f"The location is within the following heritage buildings at risk: {names.values}")
        return True

    def is_heritage_building_at_risk_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Flag every UPRN that falls within a Historic England heritage building at risk polygon.

        :param uprn_gdf: GeoDataFrame with a "UPRN" column; it is modified in place
        :return: The same GeoDataFrame with a boolean "is_heritage_building" column
        """
        return self._flag_uprns_within(uprn_gdf, self.historic_england_heritage_buildings, 'is_heritage_building')
|
||||||
0
etl/spatial/__init__.py
Normal file
0
etl/spatial/__init__.py
Normal file
103
etl/spatial/app.py
Normal file
103
etl/spatial/app.py
Normal file
|
|
@ -0,0 +1,103 @@
|
||||||
|
"""
|
||||||
|
This application reads in the open uprn data from a static location and loads it into
|
||||||
|
our database for querying from other services
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from tqdm import tqdm
|
||||||
|
import pandas as pd
|
||||||
|
from etl.spatial.ConservationAreaClient import ConservationAreaClient
|
||||||
|
from etl.spatial.OpenUprnClient import OpenUprnClient
|
||||||
|
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
|
||||||
|
from utils.logger import setup_logger
|
||||||
|
from utils.s3 import save_dataframe_to_s3_parquet
|
||||||
|
|
||||||
|
# Source datasets are read from BUCKET; processed partitions are written to OUTPUT_BUCKET
BUCKET = "retrofit-datalake-dev"
OUTPUT_BUCKET = "retrofit-data-dev"

# S3 keys of the source datasets inside BUCKET
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = (
    "spatial/National_Heritage_List_for_England_(NHLE)/Listed_Building_polygons.shp"
)
HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = (
    "spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp"
)
|
||||||
|
|
||||||
|
logger = setup_logger()
|
||||||
|
|
||||||
|
|
||||||
|
def app():
    """
    This application uses the conservation area datasets to determine if a UPRN is
    in a conservation area or not. It also flags UPRNs that are listed buildings or
    heritage buildings at risk.

    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.

    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area

    The open UPRN data can be found here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    The Office for National Statistics Postcode Lookup can be found here:
    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about

    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files and writes the results to s3
    """
    # S3 keys always use "/" separators, so build them with posixpath rather than the
    # host-dependent os.path
    import posixpath

    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
        bucket=BUCKET
    )
    conservation_area_client.read()

    special_buildings_client = SpecialBuildingsClient(
        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
        bucket=BUCKET
    )
    special_buildings_client.read()

    open_uprn_client = OpenUprnClient(
        path=OPEN_UPRN_PATHNAME,
        bucket=BUCKET
    )
    open_uprn_client.read()

    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()

    logger.info("Extracting spatial data for uprn partitions")
    # Group once and reuse the grouping for both the iteration and the progress-bar total
    partitions = open_uprn_client.data.groupby("filename")

    for filename, uprn_df in tqdm(partitions, total=len(partitions)):
        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)

        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)

        # Convert back to a regular dataframe
        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
        uprn_gdf = pd.DataFrame(uprn_gdf)

        save_dataframe_to_s3_parquet(
            df=uprn_gdf, file_key=posixpath.join("spatial", filename), bucket_name=OUTPUT_BUCKET
        )

    # We finally save the filenames to s3
    open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
import pytest
|
import pytest
|
||||||
from model_data.BoreholeClient import BoreholeClient
|
from etl.spatial.BoreholeClient import BoreholeClient
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
0
etl/wall_area/__init__.py
Normal file
0
etl/wall_area/__init__.py
Normal file
|
|
@ -1,5 +1,5 @@
|
||||||
"""
|
"""
|
||||||
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
|
This script produces the dataset used to model the wall area of epc, which is used to estimate the cost
|
||||||
of insulation measures within homes
|
of insulation measures within homes
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
|
|
@ -83,7 +83,7 @@ resource "aws_db_instance" "default" {
|
||||||
publicly_accessible = true
|
publicly_accessible = true
|
||||||
}
|
}
|
||||||
|
|
||||||
# Set up the bucket that recieve the csv uploads of properties to be retrofit
|
# Set up the bucket that recieve the csv uploads of epc to be retrofit
|
||||||
module "s3_presignable_bucket" {
|
module "s3_presignable_bucket" {
|
||||||
source = "./modules/s3_presignable_bucket"
|
source = "./modules/s3_presignable_bucket"
|
||||||
bucketname = "retrofit-plan-inputs-${var.stage}"
|
bucketname = "retrofit-plan-inputs-${var.stage}"
|
||||||
|
|
|
||||||
12
input_property_list.csv
Normal file
12
input_property_list.csv
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
address,postcode,Notes,,,,
|
||||||
|
28 Distillery Wharf,W6 9bf,,,,,
|
||||||
|
Flat 14 Godley V C House,E2 0LP,,,,,
|
||||||
|
49 Elderfield Road,E5 0LF,,,,,
|
||||||
|
26 Stanhope Road,N6 5NG,,,,,
|
||||||
|
Flat 3 Frederick Building,N1 4BD,,,,,
|
||||||
|
Flat 4 Frederick Building,N1 4BD,,,,,
|
||||||
|
"Flat 28, 22 Adelina Grove",E1 3BX,,,,,
|
||||||
|
"Flat 39, 239 Long Lane",SE1 4PT,,,,,
|
||||||
|
"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,,
|
||||||
|
"59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,,
|
||||||
|
88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,,
|
||||||
|
|
|
@ -1,49 +0,0 @@
|
||||||
# Environment setup
|
|
||||||
|
|
||||||
We're using conda to manage environments to circumvent the
|
|
||||||
issues with Mac M1. This documentation will also cover Pycharm setup.
|
|
||||||
|
|
||||||
We're working in python 3.10 so
|
|
||||||
|
|
||||||
```commandline
|
|
||||||
conda create -n hestia-data python=3.10
|
|
||||||
```
|
|
||||||
|
|
||||||
Then activate the environment
|
|
||||||
|
|
||||||
```commandline
|
|
||||||
conda activate hestia-data
|
|
||||||
```
|
|
||||||
|
|
||||||
To set up with Pycharm, run
|
|
||||||
|
|
||||||
```commandline
|
|
||||||
which python
|
|
||||||
```
|
|
||||||
|
|
||||||
and grab the path to the python executable. Then in Pycharm, go to
|
|
||||||
Settings > Project > Python Interpreter and click the gear icon
|
|
||||||
to add a new interpreter. Select Conda and either paste the path to the python executable
|
|
||||||
and click OK, or select the conda environment from the dropdown.
|
|
||||||
|
|
||||||
You may need to restart Pycharm for the new interpreter to be recognised.
|
|
||||||
|
|
||||||
To install project dependencies navigate to /model_data and run
|
|
||||||
|
|
||||||
```commandline
|
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
### Running Tests
|
|
||||||
|
|
||||||
If you are not in a virtual environment, activate it with
|
|
||||||
|
|
||||||
```commandline
|
|
||||||
conda activate envName
|
|
||||||
```
|
|
||||||
|
|
||||||
Then run
|
|
||||||
|
|
||||||
```commandline
|
|
||||||
pytest --cov-config=model_data/.coveragerc --cov=model_data
|
|
||||||
```
|
|
||||||
|
|
@ -1,650 +0,0 @@
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import statsmodels.api as sm
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
from typing import Dict, Optional, List
|
|
||||||
from sklearn.model_selection import train_test_split
|
|
||||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
|
||||||
median_absolute_error, mean_absolute_percentage_error
|
|
||||||
from sklearn.ensemble import RandomForestRegressor
|
|
||||||
from sklearn.inspection import permutation_importance
|
|
||||||
from model_data.EpcClean import EpcClean
|
|
||||||
|
|
||||||
from statsmodels.stats.outliers_influence import variance_inflation_factor
|
|
||||||
from tqdm import tqdm
|
|
||||||
from utils.logger import setup_logger
|
|
||||||
|
|
||||||
logger = setup_logger()
|
|
||||||
|
|
||||||
|
|
||||||
class SapModel:
|
|
||||||
# We want to estimate for making improvements on different property components
|
|
||||||
RESPONSE = "current-energy-efficiency"
|
|
||||||
# We could potentially build models by constituency to avoid having too many
|
|
||||||
# features in the model
|
|
||||||
BASE_FEATURES = [
|
|
||||||
"property-type",
|
|
||||||
"built-form",
|
|
||||||
"construction-age-band",
|
|
||||||
"number-habitable-rooms",
|
|
||||||
"constituency",
|
|
||||||
"number-heated-rooms",
|
|
||||||
"transaction-type"
|
|
||||||
]
|
|
||||||
|
|
||||||
COMPONENT_FEATURES = [
|
|
||||||
"walls-description",
|
|
||||||
"floor-description",
|
|
||||||
"lighting-description",
|
|
||||||
"roof-description",
|
|
||||||
"mainheat-description",
|
|
||||||
"hotwater-description",
|
|
||||||
"main-fuel",
|
|
||||||
"mechanical-ventilation",
|
|
||||||
"secondheat-description",
|
|
||||||
"energy-tariff",
|
|
||||||
"solar-water-heating-flag",
|
|
||||||
"photo-supply",
|
|
||||||
"windows-description",
|
|
||||||
"glazed-type",
|
|
||||||
"glazed-area",
|
|
||||||
"multi-glaze-proportion",
|
|
||||||
# "lighting-description" # Might not need to use this
|
|
||||||
"low-energy-lighting",
|
|
||||||
"number-open-fireplaces",
|
|
||||||
"mainheatcont-description",
|
|
||||||
"fixed-lighting-outlets-count",
|
|
||||||
"floor-height",
|
|
||||||
"floor-level",
|
|
||||||
"total-floor-area",
|
|
||||||
"extension-count",
|
|
||||||
]
|
|
||||||
|
|
||||||
CATEGORICAL_COLS = [
|
|
||||||
"property-type",
|
|
||||||
"built-form",
|
|
||||||
"number-habitable-rooms",
|
|
||||||
"constituency",
|
|
||||||
"number-heated-rooms",
|
|
||||||
"mainheat-description",
|
|
||||||
"hotwater-description",
|
|
||||||
"main-fuel",
|
|
||||||
"mechanical-ventilation",
|
|
||||||
"secondheat-description",
|
|
||||||
"energy-tariff",
|
|
||||||
"solar-water-heating-flag",
|
|
||||||
"windows-description",
|
|
||||||
"glazed-type",
|
|
||||||
"glazed-area",
|
|
||||||
"construction-age-band",
|
|
||||||
"lighting-description",
|
|
||||||
"mainheatcont-description",
|
|
||||||
"floor-level",
|
|
||||||
]
|
|
||||||
|
|
||||||
NUMERICAL_COLUMNS = [
|
|
||||||
"photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces",
|
|
||||||
"fixed-lighting-outlets-count",
|
|
||||||
"floor-height",
|
|
||||||
"total-floor-area",
|
|
||||||
"extension-count",
|
|
||||||
]
|
|
||||||
|
|
||||||
# For the moment, we store records of the best performing models as a benchmark for future improvements
|
|
||||||
BEST_FIT = {
|
|
||||||
'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763,
|
|
||||||
'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118,
|
|
||||||
'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197
|
|
||||||
}
|
|
||||||
|
|
||||||
BEST_PREDICT = {
|
|
||||||
'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514,
|
|
||||||
'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312,
|
|
||||||
'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828
|
|
||||||
}
|
|
||||||
|
|
||||||
BEST_FINAL = {
|
|
||||||
'MAPE': 0.04841470773386795, 'Mean Squared Error': 21.323052316630914, 'Mean Absolute Error': 2.988547998636157,
|
|
||||||
'R2 Score': 0.7633662459299112, 'Explained Variance Score': 0.7633785339028832,
|
|
||||||
'Median Absolute Error': 1.9487883489495985
|
|
||||||
}
|
|
||||||
|
|
||||||
BUCKET_VARIABLES = [
|
|
||||||
"number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion'
|
|
||||||
]
|
|
||||||
|
|
||||||
def __init__(
    self, data: List[Dict],
    cleaner: EpcClean,
    test_size: Optional[float] = 0.2,
    random_state: Optional[int] = None
):
    """
    Store the raw records and initialise every piece of modelling state to an empty placeholder.

    :param data: List of record dictionaries, loaded into a pandas dataframe
    :param cleaner: Cleaner object whose ``cleaned`` lookups are merged onto the modelling data
    :param test_size: Fraction of the data held out for testing; None falls back to 0.2
    :param random_state: Seed for the train/test split; None falls back to 42
    """
    self.df = pd.DataFrame(data)
    self.cleaner = cleaner
    self.random_state = 42 if random_state is None else random_state
    self.test_size = test_size if test_size is not None else 0.2

    # Modelling dataset and its train/test partitions
    self.model_data = None
    self.train_x, self.train_y = None, None
    self.test_x, self.test_y = None, None

    # Fitted models
    self.test_model = None
    self.final_model = None

    # Accuracy metrics for each stage
    self.fit_error = None
    self.predict_error = None
    self.final_error = None

    # Worst-performing records per stage, each starting as an empty dataframe
    worst_keys = ("fit_errors", "prediction_errors", "fit_x", "prediction_x", "final_errors", "final_x")
    self.worst = {key: pd.DataFrame() for key in worst_keys}

    self.fit_df = None
    self.predict_df = None
    self.final_fit_df = None
    self.diagnosis = {}
|
|
||||||
|
|
||||||
def run(self, plot: bool = False) -> None:
    """
    A pipeline method to run all necessary methods in correct order.

    Builds the modelling dataset, fits the model and optionally plots the regression.
    Any exception is logged (with its traceback) rather than re-raised.

    :param plot: Boolean to indicate whether to plot the regression
    """
    try:
        self.create_dataset()
        self.fit_model()
        if plot:
            self.plot_regression(self.fit_df)
    except Exception as e:
        # Deliberately best-effort: failures are logged, not raised. exc_info=True keeps the
        # traceback in the log so the failing pipeline step can be identified.
        logger.error("An error occurred during execution.", exc_info=True)
        logger.error(str(e))
|
|
||||||
|
|
||||||
def _merge_with_u_values(
|
|
||||||
self, model_data: pd.DataFrame, description: str, thermal_transmittance: str
|
|
||||||
) -> pd.DataFrame:
|
|
||||||
|
|
||||||
"""
|
|
||||||
Utility function to merge u value data with model data
|
|
||||||
:param model_data: Pandas dataframe which is the main modelling dataset
|
|
||||||
:param description: Name of the description column for which we're merging u-values onto
|
|
||||||
:param thermal_transmittance: Name of the thermal transmittance column
|
|
||||||
:return:
|
|
||||||
"""
|
|
||||||
|
|
||||||
u_values = pd.DataFrame(self.cleaner.cleaned[f"{description}-description"])[
|
|
||||||
["original_description", thermal_transmittance]].rename(
|
|
||||||
columns={thermal_transmittance: f"{description}_u_value"}
|
|
||||||
)
|
|
||||||
|
|
||||||
model_data = model_data.merge(
|
|
||||||
u_values,
|
|
||||||
how="left",
|
|
||||||
left_on=f"{description}-description",
|
|
||||||
right_on="original_description"
|
|
||||||
).drop(columns=["original_description"])
|
|
||||||
|
|
||||||
return model_data
|
|
||||||
|
|
||||||
def _append_cleaned_data(self, model_data: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
"""
|
|
||||||
Appends cleaned data into the model data.
|
|
||||||
:param model_data: Original model data.
|
|
||||||
:return: Model data with cleaned data appended.
|
|
||||||
"""
|
|
||||||
for description in ["walls", "floor", "roof"]:
|
|
||||||
model_data = self._merge_with_u_values(model_data, description, "thermal_transmittance")
|
|
||||||
|
|
||||||
# lighting_proportions added separately as it doesn't use the _merge_with_u_values method
|
|
||||||
lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
|
|
||||||
["original_description", "low_energy_proportion"]]
|
|
||||||
|
|
||||||
model_data = model_data.merge(
|
|
||||||
lighting_proportions,
|
|
||||||
how="left",
|
|
||||||
left_on="lighting-description",
|
|
||||||
right_on="original_description"
|
|
||||||
).drop(columns=["original_description"])
|
|
||||||
|
|
||||||
return model_data
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _convert_transaction_type(model_data: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
"""
|
|
||||||
Converts transaction type to boolean
|
|
||||||
:param model_data: Model data with transaction type.
|
|
||||||
:return: Model data with converted transaction type.
|
|
||||||
"""
|
|
||||||
model_data["is_rdsap"] = model_data["transaction-type"] != "new dwelling"
|
|
||||||
model_data = model_data.drop(columns=["transaction-type"])
|
|
||||||
return model_data
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def bucket_and_fill(df: pd.DataFrame, column_name: str, n_bins: int = 10) -> pd.DataFrame:
|
|
||||||
"""
|
|
||||||
Simple utility function to bucket up features into bins and then fill any missing values with "NO_RECORD"
|
|
||||||
:param df: Dataframe of features to be binned
|
|
||||||
:param column_name: Name of the column to be binned
|
|
||||||
:param n_bins: Number of bins to use
|
|
||||||
:return: Dataframe with binned column
|
|
||||||
"""
|
|
||||||
# Check if the column is numerical
|
|
||||||
if np.issubdtype(df[column_name].dtype, np.number):
|
|
||||||
# Create a new categorical column from numerical one by binning the data
|
|
||||||
df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)
|
|
||||||
# Replace missing data with "NO_RECORD"
|
|
||||||
df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
|
|
||||||
df[column_name + "_bucket"] = np.where(
|
|
||||||
df[column_name + "_bucket"] == "nan",
|
|
||||||
"NO_RECORD",
|
|
||||||
df[column_name + "_bucket"]
|
|
||||||
)
|
|
||||||
return df
|
|
||||||
|
|
||||||
def _clean_numericals(self, model_data):
|
|
||||||
|
|
||||||
# Try binning numericals
|
|
||||||
remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in self.BUCKET_VARIABLES]
|
|
||||||
|
|
||||||
for col in self.BUCKET_VARIABLES:
|
|
||||||
model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
|
|
||||||
# If all values are missing, set all values to 0 - this column will get dropped
|
|
||||||
if all(pd.isnull(model_data[col])):
|
|
||||||
model_data[col + "_bucket"] = "NO_RECORD"
|
|
||||||
continue
|
|
||||||
model_data = self.bucket_and_fill(model_data, col)
|
|
||||||
|
|
||||||
# Replace the data with the binned version
|
|
||||||
model_data = model_data.drop(columns=self.BUCKET_VARIABLES)
|
|
||||||
model_data = model_data.rename(
|
|
||||||
columns=dict(zip([c + "_bucket" for c in self.BUCKET_VARIABLES], self.BUCKET_VARIABLES))
|
|
||||||
)
|
|
||||||
|
|
||||||
# Basic fill the rest of the columns with 0 - currenrtly this provided the best performance
|
|
||||||
for col in remaining_numericals:
|
|
||||||
model_data[col] = np.where(
|
|
||||||
model_data[col] == "", "0", model_data[col]
|
|
||||||
).astype(float)
|
|
||||||
|
|
||||||
return model_data
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def clean_missings(model_data: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
"""
|
|
||||||
Fills categorical missing data with sensible values
|
|
||||||
:param model_data: Original model data.
|
|
||||||
:return: Model data with cleaned categorical data.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Cleaning of energy-tariff and construction-age-band hurt prediction performance, indicating there is
|
|
||||||
# potentially
|
|
||||||
# a notable difference between a "" missing and a "NO DATA!" missing, worth differentiating
|
|
||||||
|
|
||||||
model_data["mechanical-ventilation"] = np.where(
|
|
||||||
model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"]
|
|
||||||
)
|
|
||||||
|
|
||||||
model_data["solar-water-heating-flag"] = np.where(
|
|
||||||
model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"]
|
|
||||||
)
|
|
||||||
|
|
||||||
model_data["glazed-type"] = np.where(
|
|
||||||
model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"]
|
|
||||||
)
|
|
||||||
|
|
||||||
model_data["glazed-area"] = np.where(
|
|
||||||
model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-type"]
|
|
||||||
)
|
|
||||||
|
|
||||||
return model_data
|
|
||||||
|
|
||||||
def create_dataset(self):
    """
    Build the modelling dataset from the raw records and store it on ``self.model_data``.

    Selects the response and feature columns, merges on the cleaned u-values, fills missing values,
    converts the transaction type, cleans the numerical columns and keeps only the rows that have
    a wall, floor and roof u-value.
    """
    logger.info("Creating modelling dataset")
    selected_columns = [self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES
    model_data = self.df[selected_columns].reset_index(drop=True)
    # Keep the row position as a column so it survives the later merges and filtering
    model_data["idx"] = model_data.index.copy()

    # Append on u-values
    model_data = self._append_cleaned_data(model_data)

    model_data = self.clean_missings(model_data)

    # Convert transaction_type
    model_data = self._convert_transaction_type(model_data)

    # Clean numerical columns
    model_data = self._clean_numericals(model_data)

    # Take just entries with U-values
    # TODO: Rather than doing this, do we want to include the estimated u-values?
    #  Since this ends up with just 2k entries
    u_value_columns = ["walls_u_value", "floor_u_value", "roof_u_value"]
    has_every_u_value = model_data[u_value_columns].notna().all(axis=1)
    model_data = model_data[has_every_u_value]

    # The description columns are represented by their u-values, and transaction-type by is_rdsap
    exclude_features = [
        "walls-description", "floor-description", "roof-description", "transaction-type"
    ]
    candidate_features = self.BASE_FEATURES + self.COMPONENT_FEATURES + u_value_columns + [
        self.RESPONSE, "idx", "is_rdsap"
    ]
    features = [name for name in candidate_features if name not in exclude_features]

    model_data = model_data[features]

    for col in self.CATEGORICAL_COLS:
        model_data[col] = model_data[col].astype('category')

    # Convert response
    model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float)

    self.model_data = model_data
|
|
||||||
|
|
||||||
def make_training_test(self, x):
    """
    Split the dataset into training and test partitions and store them on the instance.

    :param x: Dataframe holding the features together with the response column
    """
    features = x.drop(self.RESPONSE, axis=1)
    response = x[self.RESPONSE]

    partitions = train_test_split(
        features,
        response,
        test_size=self.test_size,
        random_state=self.random_state
    )
    self.train_x, self.test_x, self.train_y, self.test_y = partitions
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def remove_zero_std_cols(train_x, test_x=None, threshold=1e-3):
|
|
||||||
"""
|
|
||||||
Utility function to remove columns that have zero standard deviation from both test and train sets
|
|
||||||
:param train_x: Training data to remove columns from
|
|
||||||
:param test_x: If provided, remove the same columns from the test data
|
|
||||||
:param threshold: float value, if the standard deviation is below this threshold, the column is considered
|
|
||||||
to have zero standard deviation
|
|
||||||
:return: Tuple of train_x and test_x (if provided). If test_x is not provided, a null placeholder is returned
|
|
||||||
"""
|
|
||||||
# Compute standard deviations
|
|
||||||
std_devs = train_x.std()
|
|
||||||
|
|
||||||
# Find columns with zero or near-zero standard deviation
|
|
||||||
zero_std_cols = std_devs[std_devs <= threshold].index
|
|
||||||
|
|
||||||
# Drop these columns from the training data
|
|
||||||
train_x = train_x.drop(zero_std_cols, axis=1)
|
|
||||||
|
|
||||||
if test_x is not None:
|
|
||||||
# Ensure the test data has the same columns
|
|
||||||
test_x = test_x[train_x.columns]
|
|
||||||
return train_x, test_x
|
|
||||||
|
|
||||||
return train_x, None
|
|
||||||
|
|
||||||
def fit_model(self):
    """
    Main function to fit the model and produce accuracy metrics

    The categorical and bucket variables are one-hot encoded, the data is split into training and
    test sets, zero-variance and infinitely collinear columns are removed and an OLS model with an
    intercept is fitted on the training set. Its fit and prediction metrics are compared against
    BEST_FIT and BEST_PREDICT and stored in self.diagnosis. Finally an OLS model using the same
    columns (and an intercept) is fitted on the full dataset and stored as self.final_model.

    The "idx" column is never used as a regressor: it is removed before every OLS fit and
    re-attached to fit_df, predict_df and final_fit_df.

    Results are stored on the instance (test_model, final_model, fit_error, predict_error,
    final_error, worst, fit_df, predict_df, final_fit_df, diagnosis); nothing is returned.
    """

    x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True)

    # Convert booleans to integer and object columns to float so that OLS receives numeric data
    for col in x.columns:
        if x[col].dtype == bool:
            x[col] = x[col].astype(int)

        if x[col].dtype == object:
            x[col] = x[col].astype(float)

    # Create the training and test sets for each run
    self.make_training_test(x)
    self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x)
    logger.info("Detecting multi-collinearity in training dataset")
    self.detect_multi_collinearity()

    # Add a constant to the independent value
    train_x = sm.add_constant(self.train_x)
    test_x = sm.add_constant(self.test_x)
    train_idx = train_x["idx"].copy()
    test_idx = test_x["idx"].copy()
    train_x = train_x.drop(columns=["idx"])
    test_x = test_x.drop(columns=["idx"])

    logger.info("Fitting testing model")
    # make regression model
    model = sm.OLS(self.train_y, train_x)
    # fit model and print results
    self.test_model = model.fit()

    train_predictions = self.test_model.fittedvalues
    test_predictions = self.test_model.predict(test_x)

    self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics(
        y_true=self.train_y, y_pred=train_predictions
    )

    # Predict on new data
    self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics(
        y_true=self.test_y, y_pred=test_predictions
    )

    fit_success = self.check_successes(self.fit_error, self.BEST_FIT)
    predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT)

    # Aligned on the index, so only the training rows receive a fitted value
    self.model_data['fit'] = self.test_model.fittedvalues
    # The worst errors over index heavily for flats
    self.worst["fit_x"] = self.model_data[self.model_data.index.isin(self.worst["fit_errors"].index)]
    self.worst["prediction_x"] = self.model_data[self.model_data.index.isin(self.worst["prediction_errors"].index)]

    self.fit_df = pd.DataFrame(
        {
            "fit": train_predictions,
            "actual": self.train_y,
            "idx": train_idx
        }
    ).sort_values("actual", ascending=True)

    self.predict_df = pd.DataFrame(
        {
            "predictions": test_predictions,
            "actual": self.test_y,
            "idx": test_idx
        }
    )

    self.diagnosis = {
        "fit_success": fit_success,
        "predict_success": predict_success,
        "summary": self.test_model.summary()
    }

    # We're now ready to fit the final model
    # For the moment, the pre-processing at the top of this function merely removes columns, so we
    # just need to remove the columns that were removed from the training data from the final model
    logger.info("Fitting final model")
    y = x[self.RESPONSE]
    x = x[self.train_x.columns]
    idx = x["idx"].copy()
    # The constant has to be added AFTER the column selection: self.train_x does not contain the
    # "const" column, so selecting its columns afterwards would silently drop the intercept
    x = sm.add_constant(x.drop(columns=["idx"]))

    final_model = sm.OLS(y, x)
    # fit model and print results
    self.final_model = final_model.fit()
    final_predictions = self.final_model.fittedvalues

    self.final_error, self.worst["final_errors"] = self.calculate_regression_metrics(
        y_true=y, y_pred=final_predictions
    )

    self.final_fit_df = pd.DataFrame(
        {
            "fit": final_predictions,
            "actual": y,
            "idx": idx
        }
    ).sort_values("actual", ascending=True)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def check_successes(experiment_error, best_error):
|
|
||||||
"""
|
|
||||||
Simple function to check if the experiment error is better than the best error
|
|
||||||
:param experiment_error: output of calculate_regression_metrics() on the experiment
|
|
||||||
:param best_error: Current benchmark best error
|
|
||||||
:return:
|
|
||||||
"""
|
|
||||||
|
|
||||||
successes = []
|
|
||||||
for k in experiment_error:
|
|
||||||
if k in ["Explained Variance Score", "R2 Score"]:
|
|
||||||
# We want to maximise this so we want experiment error to be higher
|
|
||||||
successes.append(
|
|
||||||
{
|
|
||||||
"measure": k,
|
|
||||||
"success": experiment_error[k] >= best_error[k],
|
|
||||||
"difference": abs(experiment_error[k] - best_error[k])
|
|
||||||
}
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
successes.append(
|
|
||||||
{
|
|
||||||
"measure": k,
|
|
||||||
"success": experiment_error[k] <= best_error[k],
|
|
||||||
"difference": abs(experiment_error[k] - best_error[k])
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return pd.DataFrame(successes)
|
|
||||||
|
|
||||||
def rf_importance(self, train_x, train_y, test_x, test_y):
    """
    Estimate feature importance with a random forest

    A RandomForestRegressor is fitted on the training data and its impurity-based importances are
    reported together with the permutation importances measured on the test data. This gives a
    sense of the key features which are driving model performance

    :param train_x: Training data covariates to build the importance model on
    :param train_y: Training data response to build the importance model on
    :param test_x: Test data covariates used for the permutation importance
    :param test_y: Test data response used for the permutation importance
    :return: Tuple of two dataframes (random forest importances, permutation importances), each
        ranked from most important to least
    """

    forest = RandomForestRegressor(random_state=self.random_state)
    forest.fit(train_x, train_y)

    # Pair every feature name with the importance the forest assigned to it
    ranked = pd.DataFrame(
        [
            {"Feature": name, "rf_importance": score}
            for name, score in zip(train_x.columns, forest.feature_importances_)
        ]
    ).sort_values(by="rf_importance", ascending=False)

    return ranked, self.permuation_importance(forest, test_x, test_y)
|
|
||||||
|
|
||||||
@staticmethod
def permuation_importance(rf, test_x, test_y):
    """
    Produce the permutation importance of every feature for a fitted model
    :param rf: Random forest model to calculate permutation importance for
    :param test_x: Test covariates to be used for permutation importance
    :param test_y: Test response to be used for permutation importance
    :return: DataFrame with the columns "Feature" and "perm_importance", most important first
    """
    # Scored with the negative mean squared error
    result = permutation_importance(rf, test_x, test_y, scoring='neg_mean_squared_error')

    table = pd.DataFrame(
        {
            "Feature": test_x.columns,
            "perm_importance": result.importances_mean
        }
    )
    return table.sort_values(by="perm_importance", ascending=False)
|
|
||||||
|
|
||||||
def detect_multi_collinearity(self):
    """
    Remove the training columns whose variance inflation factor (VIF) is infinite

    The same columns are removed from the test data. A fixed set of required features is never
    removed, whatever its VIF is. Both self.train_x and self.test_x are updated in place.
    """
    # Get the VIFs for each variable
    design = self.train_x.values
    n_features = self.train_x.shape[1]
    vifs = pd.DataFrame(
        {
            "features": self.train_x.columns,
            "vif": [variance_inflation_factor(design, position) for position in tqdm(range(n_features))]
        }
    ).sort_values("vif", ascending=False)

    # There are some features, we do not want to remove
    protected = ["walls_u_value", "floor_u_value", "roof_u_value", "idx", "is_rdsap"]
    candidates = vifs.loc[~vifs["features"].isin(protected)]
    infinite = candidates.loc[np.isinf(candidates["vif"])]

    # Acceptable drop variables:
    # main-fuel_Gas: mains gas
    # glazed-type_NO DATA!
    # glazed-area_NO DATA!

    self.train_x = self.train_x.drop(columns=infinite["features"].values)
    self.test_x = self.test_x[self.train_x.columns]
|
|
||||||
|
|
||||||
@staticmethod
def plot_regression(df):
    """
    Draw the fitted and the actual values of a regression on one chart and show it
    :param df: DataFrame with a "fit" and an "actual" column; rows are plotted in their current order
    """
    # The x-axis is simply the position of each row
    positions = np.arange(len(df))

    # One line per series: (column, colour, legend label)
    for column, colour, label in (("fit", "red", "Fit"), ("actual", "blue", "Actual")):
        plt.plot(positions, df[column], color=colour, label=label)

    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title('Linear Regression - Fit vs Actual')
    plt.legend()
    plt.show()
|
|
||||||
|
|
||||||
@staticmethod
def calculate_regression_metrics(y_true, y_pred, n=20):
    """
    Calculate six accuracy metrics for a regression and collect its worst errors.

    Args:
        y_true (array-like): Array of true target values.
        y_pred (array-like): Array of predicted target values.
        n (int): Number of rows with the largest absolute residual to return.

    Returns:
        tuple: (metrics, worst_errors) where metrics is a dictionary containing the calculated
        metrics and worst_errors is a DataFrame with the columns "Fit" (prediction), "Actual"
        (true value), "Residual" (Actual - Fit) and "Absolute Residual" for the n worst rows.
    """
    metrics = {
        'MAPE': mean_absolute_percentage_error(y_true, y_pred),
        'Mean Squared Error': mean_squared_error(y_true, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
        'R2 Score': r2_score(y_true, y_pred),
        'Explained Variance Score': explained_variance_score(y_true, y_pred),
        'Median Absolute Error': median_absolute_error(y_true, y_pred)
    }

    # The frame is seeded with y_true so that the rows keep the index (and order) of the true values;
    # "Fit" is the prediction and "Actual" the true value (these labels used to be swapped)
    errors = pd.DataFrame({'Actual': y_true})
    errors['Fit'] = y_pred
    errors = errors[['Fit', 'Actual']]
    errors['Residual'] = errors['Actual'] - errors['Fit']
    errors['Absolute Residual'] = np.abs(errors['Residual'])

    worst_errors = errors.nlargest(n, 'Absolute Residual')

    return metrics, worst_errors
|
|
||||||
|
|
@ -1,207 +0,0 @@
|
||||||
import pickle
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
from model_data.EpcClean import EpcClean
|
|
||||||
|
|
||||||
|
|
||||||
class UvalueEstimations:
    """
    Summarise measured U-values (thermal transmittance) of walls and floors.

    Only the records whose description contains "Average thermal transmittance" are used. They are
    grouped by local authority, property type, efficiency ratings, built form, room counts and the
    decile of the total floor area; the median U-value and the sample size of each group are stored.
    """

    # Text that identifies a description reporting a thermal transmittance
    U_VALUE_MARKER = "Average thermal transmittance"

    def __init__(self, data: list):
        """
        Initialize the UvalueEstimations class.

        :param data: The input data as a list of dictionaries, to be converted to a dataframe
        """
        self.data = pd.DataFrame(data)
        self.walls = None
        self.walls_decile_data = {}
        self.roofs = None
        self.floors = None
        self.floors_decile_data = {}

    def get_estimates(self, cleaner: "EpcClean"):
        """
        Calculate U-value estimates for walls, roofs, and floors.

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.set_walls(cleaner)
        self.set_roofs(cleaner)
        self.set_floors(cleaner)

    def _summarise_element(self, cleaner: "EpcClean", element: str):
        """
        Build the grouped U-value summary of one building element.

        :param cleaner: Object whose ``cleaned["<element>-description"]`` holds records with the keys
            "original_description" and "thermal_transmittance".
        :param element: Column prefix of the element, "walls" or "floor".
        :return: Tuple of (summary dataframe, dictionary with "decile_labels" and "decile_boundaries").
        """
        description_col = f"{element}-description"
        energy_col = f"{element}-energy-eff"
        env_col = f"{element}-env-eff"
        area_col = "total-floor-area"

        # na=False: records without a description are treated as not matching instead of raising
        has_u_value = self.data[description_col].str.contains(self.U_VALUE_MARKER, regex=False, na=False)

        # Take just the columns we want; the copy makes the column assignments below safe
        element_df = self.data.loc[
            has_u_value,
            [
                "local-authority", "property-type", description_col, energy_col, env_col, "built-form",
                area_col, "number-habitable-rooms", "number-heated-rooms"
            ]
        ].copy()
        element_df[area_col] = element_df[area_col].astype(float)

        element_df, decile_labels, decile_boundaries = self.classify_into_deciles(element_df, area_col)

        # We now get the U-values
        u_values = pd.DataFrame(cleaner.cleaned[description_col])[["original_description", "thermal_transmittance"]]
        element_df = element_df.merge(
            u_values,
            how="left",
            right_on="original_description",
            left_on=description_col
        )

        summary = element_df.groupby(
            [
                "local-authority",
                "property-type",
                energy_col,
                env_col,
                "built-form",
                "number-habitable-rooms",
                "number-heated-rooms",
                area_col + "_group"
            ],
            observed=True
        ).agg(
            median_thermal_transmittance=("thermal_transmittance", "median"),
            n_samples=("thermal_transmittance", "size")
        ).reset_index()

        decile_data = {
            "decile_labels": decile_labels,
            "decile_boundaries": decile_boundaries
        }
        return summary, decile_data

    def set_walls(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for walls.

        Stores the summary in self.walls and the decile definition in self.walls_decile_data.

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.walls, self.walls_decile_data = self._summarise_element(cleaner, "walls")

    def set_roofs(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for roofs.

        Not implemented: self.roofs is left untouched.

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        pass

    def set_floors(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for floors.

        Stores the summary in self.floors and the decile definition in self.floors_decile_data.

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.floors, self.floors_decile_data = self._summarise_element(cleaner, "floor")

    @staticmethod
    def classify_into_deciles(df: pd.DataFrame, column: str) -> "tuple[pd.DataFrame, list, np.ndarray]":
        """
        Break a column in a Pandas DataFrame into deciles.

        The decile of every row is written to a new "<column>_group" column of ``df`` itself.

        :param df: The input Pandas DataFrame (modified in place).
        :param column: The column name to break into deciles.

        :return: A tuple containing:
            - The DataFrame with the decile group column.
            - The list of decile labels.
            - The array of the 11 decile boundaries.
        """
        # Calculate decile boundaries (0th, 10th, ..., 100th percentile)
        decile_boundaries = np.percentile(df[column], np.arange(0, 101, 10))

        # Create decile labels
        decile_labels = [f"Decile {i + 1}" for i in range(10)]

        # Assign decile labels to existing values; include_lowest keeps the minimum in "Decile 1"
        df[column + "_group"] = pd.cut(df[column], bins=decile_boundaries, labels=decile_labels,
                                       include_lowest=True)

        return df, decile_labels, decile_boundaries

    @staticmethod
    def classify_decile_newvalues(decile_boundaries, decile_labels, new_values: list) -> list:
        """
        Classify new values into existing deciles based on decile definitions.

        :param decile_boundaries: The decile boundaries returned by classify_into_deciles.
        :param decile_labels: The list of decile labels.
        :param new_values: A list of new values to classify.

        :return: The classifications for the new values as a list.
        """
        # Classify new values based on decile definitions
        classifications = pd.cut(new_values, bins=decile_boundaries, labels=decile_labels, include_lowest=True)
        return classifications.tolist()

    def _save(self, filename):
        """
        Useful utility function to store this object, which is particularly handy for unit testing

        :param filename: Path of the file the pickled object is written to
        :return:
        """
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
|
|
||||||
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
import os
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
|
|
||||||
# Read the variables declared in the local environment file before any setting is looked up
load_dotenv(dotenv_path='model_data/.env')

# Value of the EPC_AUTH_TOKEN environment variable (None when it is not set)
EPC_AUTH_TOKEN = os.getenv('EPC_AUTH_TOKEN')
|
|
||||||
|
|
@ -1,29 +0,0 @@
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
def pagenated_epc_download(client, params, page_size, n_pages, verbose=0, slowdown=0.1):
    """
    Download search results page by page and return all rows in one list

    :param client: Object exposing ``domestic.search(params=..., offset_from=..., size=...)``
    :param params: Search parameters handed to every request unchanged
    :param page_size: Number of rows requested per page; also the step of the offset
    :param n_pages: Downloading stops once this many non-empty pages were received
    :param verbose: When truthy, the page number is printed before each request
    :param slowdown: Seconds to sleep before every request
    :return: List with the "rows" of every page received, in request order
    """
    rows = []
    offset = 0
    pages_received = 0

    while True:
        if verbose:
            print("Pulling for page %s" % str(int(offset / page_size) + 1))
        time.sleep(slowdown)
        response = client.domestic.search(params=params, offset_from=offset, size=page_size)

        # Note: We can only make 10k queries for a single set of search queries.
        # It might make sense to download data via zip for machine learning since we don't need this
        # data to be perfectly up to date
        if not response:
            # An empty response ends the download
            break

        pages_received += 1
        rows.extend(response["rows"])

        if pages_received == n_pages:
            break
        offset += page_size

    return rows
|
|
||||||
|
|
@ -1,40 +0,0 @@
|
||||||
import pandas as pd
|
|
||||||
import seaborn as sns
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
|
|
||||||
|
|
||||||
def create_heatmap_plots(data, response_var, pivot_var1, pivot_var2, order1=None, order2=None):
    """
    Show a heatmap of a response variable pivoted by two other variables.

    :param data: List of dictionaries, input data.
    :param response_var: String, response variable to be plotted (converted to float).
    :param pivot_var1: String, variable shown on the rows of the heatmap.
    :param pivot_var2: String, variable shown on the columns of the heatmap.
    :param order1: List, the order of categories for pivot_var1. Optional.
    :param order2: List, the order of categories for pivot_var2. Optional.

    Returns:
        None. Displays the generated plot.
    """

    # The response has to be numeric before it can be aggregated
    frame = pd.DataFrame(data)
    frame[response_var] = frame[response_var].astype(float)

    table = pd.pivot_table(frame, index=pivot_var1, columns=pivot_var2, values=response_var)

    # Apply the requested ordering of the rows and of the columns
    if order1 is not None:
        table = table.reindex(order1)
    if order2 is not None:
        table = table[order2]

    plt.figure(figsize=(10, 6))
    sns.heatmap(table, annot=True, fmt=".2f", cmap='coolwarm')
    plt.title(f"Heatmap of {response_var} by {pivot_var1} and {pivot_var2}")
    plt.show()
|
|
||||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue