Merge pull request #237 from Hestia-Homes/spatial-data

Spatial data
This commit is contained in:
KhalimCK 2023-10-11 12:32:37 +08:00 committed by GitHub
commit b2142a7f8e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
154 changed files with 1977 additions and 13742 deletions

View file

@ -2,12 +2,8 @@
omit =
*__init__*
*/tests/*
model_data/temp_inputs.py
model_data/config.py
model_data/__init__.py
model_data/app.py
model_data/plotting/*
recommendations/rdsap_tables.py
model_data/simulation_system/*
model_data/cleaner_app.py
*/config.py
*/app.py
*/settings.py
backend/app/*

View file

@ -1,81 +0,0 @@
name: Sap Model Deploy
on:
push:
branches: [ dev, prod ]
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.10.12
- name: Install Serverless and plugins
run: |
npm install -g serverless
npm install -g serverless-domain-manager
- name: AWS credentials for dev
if: github.ref == 'refs/heads/dev'
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }}
aws-region: eu-west-2
- name: AWS credentials for prod
if: github.ref == 'refs/heads/prod'
uses: aws-actions/configure-aws-credentials@v1
with:
aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }}
aws-region: eu-west-2
- name: Set domain name
id: set_domain
run: echo "::set-output name=domain::${{ secrets[format('{0}_DOMAIN_NAME', github.ref_name)] }}"
- name: Set ECR credentials
id: set_ecr_credentials
run: |
echo "::set-output name=ecr_uri::${{ secrets[format('{0}_SAP_MODEL_ECR_URI', github.ref_name)] }}"
- name: Setup Docker
uses: docker/setup-buildx-action@v1
- name: Login to ECR
run: |
aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
# Building and pushing Docker image with caching
- name: Build and push Docker image
uses: docker/build-push-action@v3
with:
context: ./model_data/simulation_system
file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda
push: true
tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }}
cache-from: type=gha
cache-to: type=gha,mode=max
platform: linux/amd64
provenance: false
- name: Deploy to AWS Lambda via Serverless
env:
RUNTIME_ENVIRONMENT: ${{ github.ref_name }}
MODEL_DIRECTORY_BUCKET: 'retrofit-model-directory-${{ github.ref_name }}'
PREDICTIONS_BUCKET: 'retrofit-sap-predictions-${{ github.ref_name }}'
DATA_BUCKET: 'retrofit-data-${{ github.ref_name }}'
DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }}
ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}
GITHUB_SHA: ${{ github.sha }}
run: |
# Deploy to AWS Lambda via Serverless
sls deploy --config sapmodel.serverless.yml --stage ${{ github.ref_name }} --verbose

6
.gitignore vendored
View file

@ -239,7 +239,8 @@ fabric.properties
.idea/caches/build_file_checksums.ser
# Locally stored data
/model_data/local_data/*
local_data/*
/local_data/*
*.DS_Store
infrastructure/terraform/.terraform*
@ -261,3 +262,6 @@ model_data/simulation_system/predictions/
.idea/Model.iml
.idea/misc.iml
adhoc
adhoc/*

9
.idea/Model.iml generated
View file

@ -7,7 +7,14 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">
<option name="namespacePackageFolders">
<list>
<option value="$MODULE_DIR$/local_data" />
</list>
</option>
</component>
</module>

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -43,7 +43,9 @@ class Definitions:
# contained within the first of these multiple entries is being provided. As there are no restrictions on the
# value in this first field it means that sometimes the first field in a multiple entry description field may
# contain a null value. A resolution to correct these anomalies will be considered for future data releases.
"NULL"
"NULL",
# We sometimes see fields populated with just an empty string.
""
}
DATA_ANOMALY_SUBSTRINGS = {

View file

@ -1,9 +1,22 @@
from datetime import datetime
import re
import os
import pandas as pd
from etl.epc.DataProcessor import DataProcessor
from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map
from utils.logger import setup_logger
from utils.s3 import read_dataframe_from_s3_parquet
from epc_api.client import EpcClient
from model_data.config import EPC_AUTH_TOKEN
from model_data.BaseUtility import Definitions
from BaseUtility import Definitions
from recommendations.rdsap_tables import england_wales_age_band_lookup
from recommendations.recommendation_utils import estimate_floors, estimate_perimeter, get_wall_type, estimate_wall_area
ENVIRONMENT = os.environ.get('ENVIRONMENT', 'dev')
EPC_AUTH_TOKEN = os.environ.get('EPC_AUTH_TOKEN')
DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT == 'dev' else None)
logger = setup_logger()
class Property(Definitions):
@ -30,17 +43,27 @@ class Property(Definitions):
lighting = None
coordinates = None
age_band = None
def __init__(self, id, postcode, address1, epc_client=None, data=None):
self.id = id
self.postcode = postcode
self.address1 = address1
self.data = data
self.old_data = None
self.property_dimensions = None
self.uprn = None
self.full_sap_epc = None
self.in_conservation_area = None
self.in_conservation_area, self.is_listed, self.is_heritage = None, None, None
self.restricted_measures = False
self.year_built = None
self.number_of_rooms = None
self.age_band = None
self.construction_age_band = None
self.number_of_floors = None
self.perimeter = None
self.wall_type = None
self.floor_type = None
self.energy = None
self.ventilation = None
@ -83,9 +106,14 @@ class Property(Definitions):
]
if len(newest_response) > 1:
raise Exception("More than one result found for this address - investigate me")
# We'll keep old EPCs in case it contains information, not present on the newest one
self.old_data = [epc for epc in response["rows"] if epc["lmk-key"] != newest_response[0]["lmk-key"]]
response["rows"] = newest_response
self.data = response["rows"][0]
self.uprn = int(self.data["uprn"])
def set_coordinates(self, coordinates):
"""
@ -127,7 +155,7 @@ class Property(Definitions):
"""
ventilation = self.data["mechanical-ventilation"]
# perform some simple cleaning - when checking 300k properties, the only unique values were
# perform some simple cleaning - when checking 300k epc, the only unique values were
# {'', 'mechanical, supply and extract', 'NO DATA!', 'natural', 'mechanical, extract only'}
if ventilation in self.DATA_ANOMALY_MATCHES or ventilation in [""]:
ventilation = None
@ -145,7 +173,7 @@ class Property(Definitions):
- solar_pv
This is based on the "photo-supply" field in the EPC data.
When checking 100k properties, either the value was "" or a stringified number
When checking 100k epc, either the value was "" or a stringified number
"""
solar_pv = self.data["photo-supply"]
@ -244,11 +272,10 @@ class Property(Definitions):
self.set_count_variables()
self.set_heat_loss_corridor()
self.set_mains_gas()
self.set_floor_height()
self.set_wall_area()
self.set_floor_area()
self.set_age_band()
self.set_basic_property_dimensions()
for description, attribute in cleaned.items():
if self.data[description] in self.DATA_ANOMALY_MATCHES:
@ -262,10 +289,19 @@ class Property(Definitions):
attributes = [
x for x in cleaned[description] if x["original_description"] == self.data[description]
]
if len(attributes) != 1:
if len(attributes) > 1:
raise ValueError("Either No attributes or multiple found for %s" % description)
if len(attributes) == 0:
# We attempt to perform the clean on the fly
cleaner_cls = all_cleaner_map[description]
attributes = [cleaner_cls(self.data[description]).process()]
setattr(self, self.ATTRIBUTE_MAP[description], attributes[0])
self.set_wall_type()
self.set_floor_type()
def set_age_band(self):
"""
Sets a cleaned version of the age band of the property given the EPC data
@ -275,14 +311,20 @@ class Property(Definitions):
if not self.data:
raise ValueError("Property does not contain data")
self.age_band = england_wales_age_band_lookup[self.data["construction-age-band"]]
self.construction_age_band = DataProcessor.clean_construction_age_band(self.data["construction-age-band"])
self.age_band = england_wales_age_band_lookup.get(self.construction_age_band)
def set_is_in_conservation_area(self, in_conservation_area):
def set_spatial(self, spatial: pd.DataFrame):
    """
    Sets the spatial planning flags of the property from its spatial data

    The conservation status, listed building flag and heritage building flag are read from the first row of
    the dataframe and stored on the instance. If any one of them is a boolean True, the property is marked as
    having restricted measures. restricted_measures is never reset to False by this method.

    :param spatial: Dataframe, containing the spatial data for the property. It must have at least one row and
        the columns "conservation_status", "is_listed_building" and "is_heritage_building"
    :raises IndexError: if the dataframe has no rows
    """
    self.in_conservation_area = spatial["conservation_status"].values[0]
    self.is_listed = spatial["is_listed_building"].values[0]
    self.is_heritage = spatial["is_heritage_building"].values[0]

    # Each flag is tested on its own: `|` binds tighter than `is`, so `a is True | b is True` does not mean
    # "a or b". Values read out of a dataframe are typically numpy booleans, for which `x is True` is always
    # False, so we use a check which accepts both python and numpy booleans. Non-boolean values never
    # restrict measures.
    # NOTE(review): assumes the three columns hold booleans - confirm against the spatial dataset
    flags = (self.in_conservation_area, self.is_listed, self.is_heritage)
    if any(pd.api.types.is_bool(flag) and bool(flag) for flag in flags):
        self.restricted_measures = True
def set_year_built(self):
"""
@ -349,17 +391,6 @@ class Property(Definitions):
else:
self.mains_gas = map[self.data["mains-gas-flag"]]
def set_floor_height(self):
    """
    Stores the floor height from the EPC data as a float

    When the EPC value is an empty string or one of the known data anomalies, the floor height is set to None
    :return:
    """
    raw_height = self.data["floor-height"]
    is_usable = raw_height != "" and raw_height not in self.DATA_ANOMALY_MATCHES
    self.floor_height = float(raw_height) if is_usable else None
def _clean_upload_data(self, to_update):
for k, v in to_update.items():
if v in self.DATA_ANOMALY_MATCHES:
@ -443,21 +474,210 @@ class Property(Definitions):
return property_details_epc
def set_wall_area(self):
    """
    Placeholder for the insulatable wall area estimate

    No model is applied yet - the insulatable wall area is drawn uniformly at random between 60 and 100
    """
    from random import uniform

    self.insulation_wall_area = uniform(60, 100)
def set_floor_area(self):
"""
Sets the floor area based on the EPC data
def get_spatial_data(self, uprn_filenames):
    """
    Given a property's UPRN, this method will pull the associated spatial data from s3

    uprn_filenames is used to find the file whose [lower, upper] range contains the property's UPRN. That
    file is read from the "spatial/" prefix of DATA_BUCKET, the row for the UPRN is selected and passed to
    set_spatial. If no file covers the UPRN, or the file has no row for it, a warning is logged and the
    spatial attributes are left untouched.

    :param uprn_filenames: dataframe with "lower", "upper" and "filenames" columns
    :raises ValueError: if the UPRN has not been set on the property
    :return: None
    """
    if self.uprn is None:
        raise ValueError("UPRN is not set, run search_address_epc")

    # We get the file name for the uprn
    in_range = (uprn_filenames['lower'] <= self.uprn) & (uprn_filenames['upper'] >= self.uprn)
    filtered_df = uprn_filenames[in_range]
    if filtered_df.empty:
        logger.warning("Could not find file containing UPRNS")
        return None

    filename = filtered_df.iloc[0]['filenames']
    spatial_data = read_dataframe_from_s3_parquet(
        bucket_name=DATA_BUCKET, file_key=f"spatial/{filename}"
    )

    spatial = spatial_data[spatial_data["UPRN"] == self.uprn]
    if spatial.empty:
        # The UPRN falls inside the file's range but has no row of its own. set_spatial reads the first row,
        # so we handle this the same way as a missing file rather than failing with an IndexError
        logger.warning(f"Could not find spatial data for UPRN {self.uprn}")
        return None

    # Pull out spatial features
    self.set_spatial(spatial)
def _filter_property_dimensions(self, property_dimensions):
    """
    Will filter the property dimensions dataframe to only include the relevant rows for the property

    Rows are always filtered on property type. They are then filtered on construction age band when the
    property has a usable one, and on built form when the property has a usable built form which is present
    in the remaining rows (otherwise the built form filter is skipped so we don't end up with no rows).

    :param property_dimensions: dataframe with PROPERTY_TYPE, CONSTRUCTION_AGE_BAND, BUILT_FORM,
        NUMBER_HABITABLE_ROOMS, TOTAL_FLOOR_AREA and FLOOR_HEIGHT columns
    :return: series holding the mean NUMBER_HABITABLE_ROOMS, TOTAL_FLOOR_AREA and FLOOR_HEIGHT of the
        filtered rows
    """
    result = property_dimensions[property_dimensions["PROPERTY_TYPE"] == self.data["property-type"]]

    if self.construction_age_band is not None and self.construction_age_band not in self.DATA_ANOMALY_MATCHES:
        result = result[result["CONSTRUCTION_AGE_BAND"] == self.construction_age_band]

    built_form = self.data["built-form"]
    if built_form not in self.DATA_ANOMALY_MATCHES:
        # `value in series` checks the index labels of the series rather than its values, so we build the
        # mask explicitly and only apply it when at least one row matches
        matches_built_form = result["BUILT_FORM"] == built_form
        if matches_built_form.any():
            result = result[matches_built_form]

    return result[["NUMBER_HABITABLE_ROOMS", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]].mean()
def set_basic_property_dimensions(self):
    """
    Sets the basic dimensions of the property: floor area, number of rooms, number of floors, floor height,
    perimeter and insulatable wall area

    The floor area is taken straight from the EPC. The number of habitable rooms and the floor height are
    taken from the EPC when present; otherwise backup values are read from the property dimensions dataset
    for the property's local authority (loaded from s3 once and kept on the instance).

    Houses have their number of floors estimated from floor area and room count, flats are treated as a
    single floor, and any other property type is not handled yet. The perimeter and wall area are then
    estimated from the per-floor area, per-floor room count and floor height.
    :return:
    """
    epc = self.data
    self.floor_area = float(epc["total-floor-area"])

    def floor_height_missing():
        raw_height = epc["floor-height"]
        return raw_height == "" or raw_height in self.DATA_ANOMALY_MATCHES

    # Backup values are only needed (and only fetched) when the EPC is missing a value
    needs_backup = (not epc["number-habitable-rooms"]) or floor_height_missing()
    if needs_backup and self.property_dimensions is None:
        property_dimensions = read_dataframe_from_s3_parquet(
            bucket_name=DATA_BUCKET, file_key=f"property_dimensions/{self.data['local-authority']}.parquet"
        )
        self.property_dimensions = self._filter_property_dimensions(property_dimensions)

    if epc["number-habitable-rooms"]:
        self.number_of_rooms = float(epc["number-habitable-rooms"])
    else:
        self.number_of_rooms = float(self.property_dimensions["NUMBER_HABITABLE_ROOMS"].round())

    property_type = epc["property-type"]
    if property_type == "House":
        self.number_of_floors = estimate_floors(self.floor_area, self.number_of_rooms)
    elif property_type == "Flat":
        self.number_of_floors = 1
    else:
        raise NotImplementedError("Implement me")

    if floor_height_missing():
        self.floor_height = float(self.property_dimensions["FLOOR_HEIGHT"].round(2))
    else:
        self.floor_height = float(epc["floor-height"])

    area_per_floor = self.floor_area / self.number_of_floors
    rooms_per_floor = self.number_of_rooms / self.number_of_floors
    self.perimeter = estimate_perimeter(area_per_floor, rooms_per_floor)

    self.insulation_wall_area = estimate_wall_area(
        num_floors=self.number_of_floors, floor_height=self.floor_height, perimeter=self.perimeter
    )
def set_wall_type(self):
    """
    Derives the wall type of the property from its cleaned walls attributes

    The cleaned walls attributes are passed as keyword arguments to get_wall_type
    :return:
    """
    wall_attributes = self.walls
    self.wall_type = get_wall_type(**wall_attributes)
def set_floor_type(self):
    """
    Classifies the floor of the property as "suspended" or "solid", based on the is_suspended flag of the
    cleaned floor attributes. The floor type is used for calculating u-values
    :return:
    """
    if self.floor["is_suspended"]:
        self.floor_type = "suspended"
    else:
        self.floor_type = "solid"
@staticmethod
def _extract_component(component_data, component_rename_cols, component_drop_cols, rename_prefix=None):
for k in component_rename_cols:
component_data[f"{rename_prefix}_{k}"] = component_data[k]
component_data = {
k: v for k, v in component_data.items() if k not in component_drop_cols + component_rename_cols
}
return component_data
def get_model_data(self):
    """
    Assembles the record which is scored by our machine learning models

    The record is built from three sources:
    - the cleaned components (walls, roof, floor, fuel, heating, heating controls, hot water, windows),
      each passed through _extract_component to drop description/helper columns and prefix shared columns
    - values derived on the property (perimeter, construction age band, floor height, number of rooms,
      floor area) and from DataProcessor (days to lodgement date)
    - a fixed list of raw EPC columns, copied over under their upper-case, underscore-separated names

    When the built form is a known data anomaly, the default built form for the property type is used.

    For future iterations of this, we probably want to implement a singular method in DataProcessor, which
    can be used in the etl code and in here
    :raises NotImplementedError: if the built form is missing and there is no default for the property type
    :return: dictionary of model data to be scored in the model
    """
    description_cols = ["original_description", "clean_description"]
    fabric_drop_cols = ["thermal_transmittance_unit", "is_assumed", "is_valid"] + description_cols
    fabric_rename_cols = ["thermal_transmittance", "insulation_thickness"]
    extract = self._extract_component

    walls = extract(self.walls, fabric_rename_cols, fabric_drop_cols, "walls")
    roof = extract(self.roof, fabric_rename_cols, fabric_drop_cols, "roof")
    floor = extract(self.floor, fabric_rename_cols, fabric_drop_cols, "floor")
    windows = extract(self.windows, [], description_cols + ["no_data"])
    fuel = extract(self.main_fuel, ["tariff_type"], description_cols + ["tariff_type"], "main-fuel")
    main_heating = extract(self.main_heating, [], description_cols + ["has_assumed"])
    main_heating_controls = extract(self.main_heating_controls, [], description_cols)
    hotwater = extract(self.hotwater, ["tariff_type"], description_cols + ['assumed'], "hotwater")

    # We'll need to clean second heating
    second_heating = self.data["secondheat-description"]

    # Raw EPC fields are keyed in lower-case with hyphens, the model expects upper-case with underscores
    epc_raw_data = {}
    for column in (
        'TRANSACTION_TYPE',
        'ENERGY_TARIFF',
        'PROPERTY_TYPE',
        'UPRN',
        'NUMBER_OPEN_FIREPLACES',
        'FIXED_LIGHTING_OUTLETS_COUNT',
        'MULTI_GLAZE_PROPORTION',
        'MECHANICAL_VENTILATION',
        'PHOTO_SUPPLY',
        'LOW_ENERGY_LIGHTING',
        'SOLAR_WATER_HEATING_FLAG',
        'GLAZED_TYPE',
        'CONSTITUENCY',
        'NUMBER_HEATED_ROOMS',
        'EXTENSION_COUNT',
    ):
        epc_raw_data[column] = self.data[column.lower().replace("_", "-")]

    built_form = self.data["built-form"]
    if built_form in self.DATA_ANOMALY_MATCHES:
        # TODO: If built form isn't captured, we use the most common value for that property type - we shall
        # improve this methodology
        default_built_forms = {
            "Flat": "Mid-Terrace",
            "House": "Semi-Detached",
            "Bungalow": "Detached",
            "Maisonette": "Mid-Terrace"
        }
        built_form = default_built_forms.get(self.data["property-type"])
        if not built_form:
            raise NotImplementedError("Not handled this property type when cleaning built form")

    # Later sources overwrite earlier ones on clashing keys, so the merge order below matters
    property_data = {}
    for component in (walls, roof, floor, fuel, main_heating, main_heating_controls, hotwater, windows):
        property_data.update(component)

    property_data.update({
        "SECONDHEAT_DESCRIPTION": second_heating,
        "DAYS_TO": DataProcessor.calculate_days_to(self.data["lodgement-date"]),
        "SAP": float(self.data["current-energy-efficiency"]),
        "CARBON": float(self.data["co2-emissions-current"]),
        "HEAT_DEMAND": float(self.data["energy-consumption-current"]),
        "estimated_perimeter": self.perimeter,
        "CONSTRUCTION_AGE_BAND": self.construction_age_band,
        "FLOOR_HEIGHT": self.floor_height,
        "NUMBER_HABITABLE_ROOMS": self.number_of_rooms,
        "TOTAL_FLOOR_AREA": self.floor_area,
    })
    property_data.update(epc_raw_data)
    property_data["BUILT_FORM"] = built_form

    return property_data

View file

@ -1,10 +1,17 @@
from backend.app.db.models.materials import Material
from functools import lru_cache
@lru_cache(maxsize=128)
def get_materials(session):
"""
This function will retrieve all materials from the database.
:return: A list of Material objects if successful, an empty list otherwise.
TODO: It might not be the best choice to store the materials data in a database table since this
table probably won't be very large and won't be updated that often. It might be better to
store this data in s3 and load it into memory when the app starts up. We will test this
"""
materials = session.query(Material).filter(Material.is_active).all()

View file

@ -12,6 +12,7 @@ class MaterialType(enum.Enum):
solid_floor_insulation = "solid_floor_insulation"
external_wall_insulation = "external_wall_insulation"
internal_wall_insulation = "internal_wall_insulation"
cavity_wall_insulation = "cavity_wall_insulation"
class DepthUnit(enum.Enum):

View file

@ -1,50 +1,41 @@
from collections import defaultdict
from fastapi import APIRouter, Depends
from backend.app.db.models.portfolio import rating_lookup
from backend.app.dependencies import validate_token
from backend.app.plan.schemas import PlanTriggerRequest
from backend.app.utils import read_csv_from_s3
from backend.app.config import get_settings
from backend.Property import Property
from epc_api.client import EpcClient
from utils.logger import setup_logger
from utils.s3 import read_from_s3
from recommendations.FloorRecommendations import FloorRecommendations
from recommendations.WallRecommendations import WallRecommendations
from recommendations.config import UPGRADES_MAP
from utils.uvalue_estimates import classify_decile_newvalues
from backend.app.db.utils import row2dict
from starlette.responses import Response
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError, OperationalError
from datetime import datetime
import pandas as pd
import msgpack
from epc_api.client import EpcClient
from fastapi import APIRouter, Depends
from sqlalchemy.exc import IntegrityError, OperationalError
from sqlalchemy.orm import sessionmaker
from starlette.responses import Response
# model apis
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
# database interaction functions
from backend.app.db.functions.property_functions import (
create_property, create_property_targets, update_property_data, create_property_details_epc
)
from backend.app.config import get_settings
from backend.app.db.connection import db_engine
from backend.app.db.functions.materials_functions import get_materials
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
from backend.app.db.functions.property_functions import (
create_property, create_property_details_epc, create_property_targets, update_property_data
)
from backend.app.db.functions.recommendations_functions import (
create_plan, create_plan_recommendations, upload_recommendations
)
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
from backend.app.db.connection import db_engine
from backend.app.db.models.portfolio import rating_lookup
from backend.app.dependencies import validate_token
from backend.app.plan.schemas import PlanTriggerRequest
from backend.app.plan.utils import (
create_recommendation_scoring_data, filter_materials, get_cleaned, insert_temp_recommendation_id
)
from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_parquet_from_s3
from model_data.optimiser.GainOptimiser import GainOptimiser
from model_data.optimiser.CostOptimiser import CostOptimiser
from backend.app.utils import epc_to_sap_lower_bound, read_parquet_from_s3
from model_data.optimiser.optimiser_functions import prepare_input_measures
from model_data.simulation_system.core.DataProcessor import DataProcessor
from model_data.simulation_system.core.Settings import COLUMNS_TO_MERGE_ON
# TODO: This is placeholder until data is stored in DB
from backend.app.plan.uvalue_estimates_walls import uvalue_estimates_walls
from backend.app.plan.uvalue_estimates_floors import uvalue_estimates_floors
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
from backend.Property import Property
from etl.epc.DataProcessor import DataProcessor
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from recommendations.FloorRecommendations import FloorRecommendations
from recommendations.optimiser.CostOptimiser import CostOptimiser
from recommendations.optimiser.GainOptimiser import GainOptimiser
from recommendations.optimiser.optimiser_functions import prepare_input_measures
from recommendations.WallRecommendations import WallRecommendations
from utils.logger import setup_logger
from utils.s3 import read_dataframe_from_s3_parquet
logger = setup_logger()
@ -55,147 +46,25 @@ router = APIRouter(
responses={404: {"description": "Not found"}}
)
# TODO: Load this data from db
open_uprn_data = [
{'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
'LONGITUDE': -0.0540506},
{'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
'LONGITUDE': -0.0498772},
{'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
'LONGITUDE': -0.226392},
{'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
'LONGITUDE': -0.0792445},
{'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
'LONGITUDE': -0.0792445},
{'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
'LONGITUDE': -0.0468833},
{'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
'LONGITUDE': -0.1362513},
{'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
'LONGITUDE': -0.0823165}
]
in_conservation_area_data = [
{'uprn': 6032920, 'is_in_conservation_area': 'not_in_conservation_area'},
{'uprn': 6038625, 'is_in_conservation_area': 'not_in_conservation_area'},
{'uprn': 34153991, 'is_in_conservation_area': 'unknown'},
{'uprn': 10008299676, 'is_in_conservation_area': 'in_conservation_area'},
{'uprn': 10008299677, 'is_in_conservation_area': 'in_conservation_area'},
{'uprn': 100021039066, 'is_in_conservation_area': 'not_in_conservation_area'},
{'uprn': 100021226060, 'is_in_conservation_area': 'in_conservation_area'},
{'uprn': 200003489276, 'is_in_conservation_area': 'in_conservation_area'}
]
# TODO: db
floors_decile_data = {
'decile_labels': ['Decile 1', 'Decile 2', 'Decile 3', 'Decile 4', 'Decile 5', 'Decile 6', 'Decile 7', 'Decile 8',
'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 50., 56., 69., 77.6, 87., 98., 112.,
127., 150., 2279.]}
walls_decile_data = {
'decile_labels': ['Decile 1', 'Decile 2', 'Decile 3', 'Decile 4', 'Decile 5', 'Decile 6', 'Decile 7', 'Decile 8',
'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 49., 51., 55., 64., 71., 76., 83., 96.,
120., 2279.]}
def filter_materials(materials):
    """
    Groups the materials by their type

    Each material is converted to a dictionary with row2dict and appended to the list for its "type"

    :param materials: iterable of material rows
    :return: plain dictionary, mapping each material type to the list of material dictionaries of that type
    """
    grouped = {}
    for row in materials:
        material = row2dict(row)
        grouped.setdefault(material["type"], []).append(material)

    return grouped
def insert_temp_recommendation_id(property_recommendations):
    """
    Gives every recommendation a temporary "recommendation_id", which is needed to tell the
    recommendations apart once the optimiser has been run

    The ids are consecutive integers starting at 0, assigned in the order the recommendations appear
    across the groups. The recommendation dictionaries are updated in place.

    :param property_recommendations: nested list of recommendations, grouped by data_types
    :return: the same nested list, where each recommendation has a "recommendation_id" integer inserted
    """
    flattened = (rec for group in property_recommendations for rec in group)
    for temp_id, rec in enumerate(flattened):
        rec["recommendation_id"] = temp_id

    return property_recommendations
def get_cleaned():
    """
    Reads the cleaned EPC descriptions dataset from s3

    The file lives in the retrofit data bucket of the current environment and is stored in MessagePack
    format, so it is decoded before being returned
    :return: the decoded cleaned dataset
    """
    environment = get_settings().ENVIRONMENT
    bucket = "retrofit-data-{environment}".format(environment=environment)

    packed = read_from_s3(s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name=bucket)
    return msgpack.unpackb(packed, raw=False)
def create_recommendation_scoring_data(
    property: Property,
    recommendation: dict,
    starting_epc_data: pd.DataFrame,
    ending_epc_data: pd.DataFrame,
    fixed_data: pd.DataFrame,
):
    """
    Prepares the record which is passed to the sap model api for a single recommendation

    The record identifies the property and recommendation, and merges in the first row of the starting,
    ending and fixed feature dataframes. The ending description of the component being upgraded is then
    replaced with its insulated equivalent from UPGRADES_MAP.

    :param property: the property the recommendation belongs to
    :param recommendation: recommendation dictionary, with "recommendation_id" and "type" keys
    :param starting_epc_data: dataframe of starting features, only the first row is used
    :param ending_epc_data: dataframe of ending features, only the first row is used
    :param fixed_data: dataframe of fixed features, only the first row is used
    :raises NotImplementedError: if the recommendation type is not wall or floor insulation
    :return: dictionary of scoring data
    """
    scoring_dict = {"UPRN": property.data["uprn"]}
    scoring_dict["id"] = "+".join([str(property.id), str(recommendation["recommendation_id"])])
    scoring_dict["LOCAL_AUTHORITY"] = property.data["local-authority"]
    for features in (starting_epc_data, ending_epc_data, fixed_data):
        scoring_dict.update(features.to_dict("records")[0])

    recommendation_type = recommendation["type"]
    if recommendation_type not in ("wall_insulation", "floor_insulation"):
        raise NotImplementedError("Implement me")

    # We update the description to indicate it's insulated
    if recommendation_type == "wall_insulation":
        scoring_dict["WALLS_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.walls["clean_description"]]
    else:
        scoring_dict["FLOOR_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.floor["clean_description"]]

    return scoring_dict
@router.post("/trigger")
async def trigger_plan(body: PlanTriggerRequest):
logger.info("Connecting to db")
session = sessionmaker(bind=db_engine)()
created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
created_at = datetime.now().isoformat()
try:
session.begin()
logger.info("Getting the inputs")
# Read in the trigger file from s3
bucket_name = get_settings().PLAN_TRIGGER_BUCKET
epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path)
uprn_filenames = read_dataframe_from_s3_parquet(
bucket_name=get_settings().DATA_BUCKET, file_key="spatial/filename_meta.parquet"
)
cleaning_data = read_parquet_from_s3(
bucket_name=get_settings().DATA_BUCKET, file_key="sap_change_model/cleaning_dataset.parquet",
)
plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
input_properties = []
for config in plan_input:
# We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
@ -228,32 +97,21 @@ async def trigger_plan(body: PlanTriggerRequest):
if not input_properties:
return Response(status_code=204)
logger.info("Getting EPC, coordinates and conservation area data")
logger.info("Getting EPC, and spatial data")
for p in input_properties:
p.search_address_epc()
p.set_year_built()
coordinate_data = [x for x in open_uprn_data if x['UPRN'] == int(p.data['uprn'])][0]
p.set_coordinates(coordinate_data)
in_conservation_area = [x for x in in_conservation_area_data if x['uprn'] == int(p.data['uprn'])][0].get(
"is_in_conservation_area"
)
p.set_is_in_conservation_area(in_conservation_area)
p.get_spatial_data(uprn_filenames)
# The materials data could be cached or local so we don't need to make
# consistent requests to the backend for
# the same data
# TODO: It might not be the best choice to store the materials data in a database table since thi
# table probably won't be very large and won't be updated that often. It might be better to
# store this data in s3 load it into memory when the app starts up. We will test this
logger.info("Reading in materials and cleaned datasets")
materials = get_materials(session)
materials_by_type = filter_materials(materials)
cleaned = get_cleaned()
logger.info("Getting components and properties recommendations")
logger.info("Getting components and epc recommendations")
# TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers
# in as a dependency and then the optimisers can take the input measures in as part of the setup() method
@ -263,34 +121,13 @@ async def trigger_plan(body: PlanTriggerRequest):
for p in input_properties:
property_recommendations = []
# For each property, classify the floor area decile
total_floor_area_group_decile = classify_decile_newvalues(
decile_boundaries=floors_decile_data["decile_boundaries"],
decile_labels=floors_decile_data["decile_labels"],
new_values=[float(p.data["total-floor-area"])],
)[0]
# Property recommendations
p.get_components(cleaned)
# This is placeholder, until the full dataset is loaded into the database and we just make a read to the
# database
floors_u_value_estimate = [
x for x in uvalue_estimates_floors
if (x['local-authority'] == p.data["local-authority"]) &
(x['property-type'] == p.data["property-type"]) &
(x['built-form'] == p.data["built-form"]) &
(x['floor-energy-eff'] == p.data["floor-energy-eff"] if p.data[
"floor-energy-eff"] != 'N/A' else True) &
(x['floor-env-eff'] == p.data["floor-env-eff"] if p.data["floor-env-eff"] != 'N/A' else True)
]
# Floor recommendations
floor_recommender = FloorRecommendations(
property_instance=p,
uvalue_estimates=floors_u_value_estimate,
total_floor_area_group_decile=total_floor_area_group_decile,
materials=materials_by_type["suspended_floor_insulation"] + materials_by_type["solid_floor_insulation"],
materials=materials_by_type["floor"],
)
floor_recommender.recommend()
@ -298,30 +135,10 @@ async def trigger_plan(body: PlanTriggerRequest):
property_recommendations.append(floor_recommender.recommendations)
# Wall recommendations
# We would make this u-value query directly to the database
total_floor_area_group_decile = classify_decile_newvalues(
decile_boundaries=walls_decile_data["decile_boundaries"],
decile_labels=walls_decile_data["decile_labels"],
new_values=[float(p.data["total-floor-area"])],
)[0]
# This is placeholder, until the full dataset is loaded into the database and we just make a read to the
# database
walls_u_value_estimate = [
x for x in uvalue_estimates_walls
if (x['local-authority'] == p.data["local-authority"]) &
(x['property-type'] == p.data["property-type"]) &
(x['built-form'] == p.data["built-form"]) &
(x['walls-energy-eff'] == p.data["walls-energy-eff"] if p.data[
"walls-energy-eff"] != 'N/A' else True) &
(x['walls-env-eff'] == p.data["walls-env-eff"] if p.data["walls-env-eff"] != 'N/A' else True)
]
wall_recomender = WallRecommendations(
property_instance=p,
uvalue_estimates=walls_u_value_estimate,
total_floor_area_group_decile=total_floor_area_group_decile,
materials=materials_by_type["external_wall_insulation"] + materials_by_type["internal_wall_insulation"]
materials=materials_by_type["walls"]
)
wall_recomender.recommend()
@ -337,12 +154,8 @@ async def trigger_plan(body: PlanTriggerRequest):
recommendations[p.id] = property_recommendations
# Finally, we'll prepare data for predicting the impact on SAP
# TODO: We should use the cleaned data from get_components in the data rather than the raw
# values. We should create a method in Property which takes the EPC data and inserts the cleaned
# data
data_processor = DataProcessor(None, newdata=True)
data_processor.insert_data(pd.DataFrame([p.data.copy()]))
data_processor.insert_data(pd.DataFrame([p.get_model_data()]))
data_processor.pre_process()
starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
@ -350,10 +163,10 @@ async def trigger_plan(body: PlanTriggerRequest):
fixed_data = data_processor.get_fixed_features()
# We update the ending record with the recommended updates and we set lodgement date to today
ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(created_at)
for recommendations_by_type in property_recommendations:
for rec in recommendations_by_type:
for i, rec in enumerate(recommendations_by_type):
scoring_dict = create_recommendation_scoring_data(
property=p,
recommendation=rec,
@ -370,15 +183,6 @@ async def trigger_plan(body: PlanTriggerRequest):
logger.info("Preparing data for scoring in sap change api")
recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
# Clean the data
logger.info("Reading in cleaning dataset from s3")
cleaning_data = read_parquet_from_s3(
bucket_name=get_settings().DATA_BUCKET,
file_key="sap_change_model/cleaning_dataset.parquet",
).rename(columns={"local-authority": "LOCAL_AUTHORITY"})
# Merge the cleaning data onto recommendations_scoring_data
# Perform the same cleaning as in the model
recommendations_scoring_data = DataProcessor.apply_averages_cleaning(
data_to_clean=recommendations_scoring_data,
@ -386,6 +190,13 @@ async def trigger_plan(body: PlanTriggerRequest):
cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
).drop(columns=["LOCAL_AUTHORITY"])
recommendations_scoring_data = DataProcessor.clean_missings_after_description_process(
recommendations_scoring_data, [
c for c in recommendations_scoring_data.columns if
("thermal_transmittance" in c) or ("insulation_thickness" in c)
]
)
sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at)
file_location = sap_change_model_api.upload_scoring_data(
df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET
@ -396,14 +207,17 @@ async def trigger_plan(body: PlanTriggerRequest):
# Retrieve the predictions
predictions = pd.DataFrame(
read_csv_from_s3(bucket_name=get_settings().PREDICTIONS_BUCKET, filepath=response["storage_filepath"])
read_parquet_from_s3(
bucket_name=get_settings().PREDICTIONS_BUCKET,
file_key=response["storage_filepath"].split(get_settings().PREDICTIONS_BUCKET + "/")[1]
)
)
predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(1)
predictions["predictions"] = predictions["predictions"].astype(float).round(1)
predictions[['property_id', 'recommendation_id']] = predictions['id'].str.split('+', expand=True)
# Insert the predictions into the recommendations and run the optimiser
logger.info("Storing recommendations")
logger.info("Optimising recommendations")
for property_id in recommendations.keys():
property = [p for p in input_properties if p.id == property_id][0]
@ -411,9 +225,11 @@ async def trigger_plan(body: PlanTriggerRequest):
for recommendations_by_type in recommendations[property_id]:
for rec in recommendations_by_type:
rec["sap_points"] = property_predictions[property_predictions["recommendation_id"] == str(
new_sap = property_predictions[property_predictions["recommendation_id"] == str(
rec["recommendation_id"]
)]["RDSAP_CHANGE"].values[0]
)]["predictions"].values[0]
rec["sap_points"] = new_sap - float(property.data["current-energy-efficiency"])
if rec["sap_points"] is None:
raise ValueError("Sap points missing")
@ -451,8 +267,6 @@ async def trigger_plan(body: PlanTriggerRequest):
final_recommendations = [
rec for recommendations_by_type in final_recommendations for rec in recommendations_by_type
]
# We update recommendations[property_id]
recommendations[property_id] = final_recommendations
# 1) the property data

View file

@ -0,0 +1,176 @@
from datetime import datetime
import pandas as pd
from epc_api.client import EpcClient
from fastapi import APIRouter, Depends
from sqlalchemy.exc import IntegrityError, OperationalError
from sqlalchemy.orm import sessionmaker
from starlette.responses import Response
from backend.app.config import get_settings
from backend.app.db.connection import db_engine
from backend.app.db.functions.materials_functions import get_materials
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
from backend.app.db.functions.property_functions import (
create_property, create_property_details_epc, create_property_targets, update_property_data
)
from backend.app.db.functions.recommendations_functions import (
create_plan, create_plan_recommendations, upload_recommendations
)
from backend.app.db.models.portfolio import rating_lookup
from backend.app.dependencies import validate_token
from backend.app.plan.schemas import PlanTriggerRequest
from backend.app.plan.utils import (
create_recommendation_scoring_data, filter_materials, get_cleaned, insert_temp_recommendation_id
)
from backend.app.utils import epc_to_sap_lower_bound, read_csv_from_s3, read_parquet_from_s3
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
from backend.Property import Property
from etl.epc.DataProcessor import DataProcessor
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from recommendations.FloorRecommendations import FloorRecommendations
from recommendations.optimiser.CostOptimiser import CostOptimiser
from recommendations.optimiser.GainOptimiser import GainOptimiser
from recommendations.optimiser.optimiser_functions import prepare_input_measures
from recommendations.WallRecommendations import WallRecommendations
from utils.logger import setup_logger
from utils.s3 import read_dataframe_from_s3_parquet
# Local, offline run of the plan-recommendation flow: inputs are read from pickle files on disk and
# the script stops once the scoring rows for the SAP change model have been assembled.
logger = setup_logger()
import pickle
# NOTE(review): pickle.load can execute arbitrary code embedded in the file - only ever point this
# script at pickle files you created yourself.
with open('local_data.pickle', 'rb') as f:
    local_data = pickle.load(f)
# NOTE(review): property_dimensions and sap_change_dataset are loaded but not referenced in the part
# of the script visible here - confirm whether they are still needed.
with open("property_dimensions.pickle", "rb") as f:
    property_dimensions = pickle.load(f)
with open("sap_change_dataset.pickle", "rb") as f:
    sap_change_dataset = pickle.load(f)
# Timestamp string for this run; it is written into LODGEMENT_DATE_ENDING further down
created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# Unpack the individual inputs from the pickled bundle
plan_input = local_data["plan_input"]
uprn_filenames = local_data["uprn_filenames"]
local_property_data = local_data["local_property_data"]
materials = local_data["materials"]
materials_by_type = filter_materials(materials)
cleaned = local_data["cleaned"]
cleaning_data = local_data["cleaning_data"]
# Need to find some proper materials
# Until then, two hard-coded example cavity wall insulation materials are appended to the wall materials
materials_by_type["walls"] += [
    {'id': 4, 'type': 'cavity_wall_insulation', 'description': 'Example Material 1',
     'depths': None,
     'depth_unit': None, 'cost': 20,
     'cost_unit': 'gbp_sq_meter', 'r_value_per_mm': 0.0278, 'r_value_unit': 'square_meter_kelvin_per_watt',
     'thermal_conductivity': 0.036, 'thermal_conductivity_unit': 'watt_per_meter_kelvin',
     'link': None, 'created_at': None, 'is_active': True},
    {'id': 10, 'type': "cavity_wall_insulation", 'description': 'Example Material 2',
     'depths': None, 'depth_unit': None, 'cost': 25, 'cost_unit': 'gbp_sq_meter',
     'r_value_per_mm': 0.02631579, 'r_value_unit': 'square_meter_kelvin_per_watt', 'thermal_conductivity': 0.038,
     'thermal_conductivity_unit': 'watt_per_meter_kelvin',
     'link': None,
     'created_at': None, 'is_active': True}
]
# The client is built with a placeholder token; the EPC data used below comes from the pickled
# local_property_data rather than from a call made in this script
epc_client = EpcClient(auth_token="NO-TOKEN")
input_properties = []
# Build one Property per plan input, taking its id from the entry at the same index in
# local_property_data
for i, config in enumerate(plan_input):
    property_id = local_property_data[i]["id"]
    input_properties.append(
        Property(
            postcode=config['postcode'],
            address1=config['address'],
            epc_client=epc_client,
            id=property_id
        )
    )
logger.info("Getting EPC, and spatial data")
# Populate each property directly from the pickled data
for i, p in enumerate(input_properties):
    p.data = local_property_data[i]["data"]
    p.uprn = local_property_data[i]["uprn"]
    p.id = local_property_data[i]["id"]
    p.full_sap_epc = local_property_data[i]["full_sap_epc"]
    p.old_data = local_property_data[i]["old_data"]
    # The spatial flags are hard-coded to False in this script
    p.is_listed = False
    p.in_conservation_area = False
    p.is_heritage = False
    p.set_year_built()
    # TODO: TESTING
    # Overrides the number of habitable rooms with a fixed value for every property
    p.data['number-habitable-rooms'] = 3
# recommendations: property id -> nested list of recommendations grouped by type
recommendations = {}
# recommendations_scoring_data: one scoring dict per (property, recommendation) pair
recommendations_scoring_data = []
for p in input_properties:
    property_recommendations = []
    # Property recommendations
    p.get_components(cleaned)
    # Floor recommendations
    floor_recommender = FloorRecommendations(
        property_instance=p,
        materials=materials_by_type["floor"],
    )
    floor_recommender.recommend()
    # Only keep a recommendation group when the recommender produced something
    if floor_recommender.recommendations:
        property_recommendations.append(floor_recommender.recommendations)
    # Wall recommendations
    wall_recomender = WallRecommendations(
        property_instance=p,
        materials=materials_by_type["walls"]
    )
    wall_recomender.recommend()
    if wall_recomender.recommendations:
        property_recommendations.append(wall_recomender.recommendations)
    # We insert temporary ids into the recommendations which is important for the optimiser later
    property_recommendations = insert_temp_recommendation_id(property_recommendations)
    # Properties without any recommendation are left out of both outputs
    if not property_recommendations:
        continue
    recommendations[p.id] = property_recommendations
    # Finally, we'll prepare data for predicting the impact on SAP
    # TODO: We should use the cleaned data from get_components in the data rather than the raw
    # values. We should create a method in Property which takes the EPC data and inserts the cleaned
    # data
    data_processor = DataProcessor(None, newdata=True)
    data_processor.insert_data(pd.DataFrame([p.data.copy()]))
    data_processor.pre_process()
    starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
    ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
    fixed_data = data_processor.get_fixed_features()
    # We update the ending record with the recommended updates and we set lodgement date to today
    ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
    # One scoring row per recommendation, all sharing this property's starting/ending/fixed features
    for recommendations_by_type in property_recommendations:
        for rec in recommendations_by_type:
            scoring_dict = create_recommendation_scoring_data(
                property=p,
                recommendation=rec,
                starting_epc_data=starting_epc_data,
                ending_epc_data=ending_epc_data,
                fixed_data=fixed_data,
            )
            recommendations_scoring_data.append(scoring_dict)
    # cleanup
    del data_processor

187
backend/app/plan/utils.py Normal file
View file

@ -0,0 +1,187 @@
import pandas as pd
from backend.Property import Property
from collections import defaultdict
from utils.s3 import read_from_s3
from recommendations.config import UPGRADES_MAP
from recommendations.recommendation_utils import get_wall_u_value, get_floor_u_value, get_roof_u_value
from backend.app.db.utils import row2dict
from backend.app.config import get_settings
import msgpack
def filter_materials(materials):
    """
    Group material records by the building component they can be applied to

    Every record is first converted with row2dict and is then placed under "walls" or "floor"
    according to its "type" value. Records whose type belongs to neither component are left out.

    :param materials: iterable of material records accepted by row2dict
    :return: dict with the keys "walls" and "floor", each holding a list of material dicts
    """
    component_types = {
        "walls": ["internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation"],
        "floor": ["suspended_floor_insulation", "solid_floor_insulation"],
    }
    material_dicts = [row2dict(row) for row in materials]
    return {
        component: [entry for entry in material_dicts if entry["type"] in allowed_types]
        for component, allowed_types in component_types.items()
    }
def insert_temp_recommendation_id(property_recommendations):
    """
    Stamp every recommendation with a temporary, sequential "recommendation_id"

    Ids start at 0 and keep counting across the groups, so each id is unique within the
    property. They are needed to tell recommendations apart once the optimiser has been run.
    The recommendation dicts are modified in place.

    :param property_recommendations: nested list of recommendations, grouped by type
    :return: the same nested list, where every recommendation now carries an integer
        "recommendation_id"
    """
    flattened = (rec for group in property_recommendations for rec in group)
    for temp_id, rec in enumerate(flattened):
        rec["recommendation_id"] = temp_id
    return property_recommendations
def get_cleaned():
    """
    Fetch the cleaned EPC descriptions dataset from s3 and decode it

    The object lives in the environment specific "retrofit-data-<environment>" bucket and is
    stored in MessagePack format, so the payload read from s3 is unpacked before it is returned.
    NOTE(review): the object key ends in ".bson" although it is decoded with msgpack - confirm
    the naming is intentional.

    :return: the decoded content of cleaned_epc_data/cleaned.bson
    """
    bucket = "retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)
    packed = read_from_s3(s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name=bucket)
    return msgpack.unpackb(packed, raw=False)
def _estimate_wall_u_value(prop):
    """Estimate the wall u-value from the property's cleaned wall component and age band."""
    return get_wall_u_value(
        clean_description=prop.walls["clean_description"],
        age_band=prop.age_band,
        is_granite_or_whinstone=prop.walls["is_granite_or_whinstone"],
        is_sandstone_or_limestone=prop.walls["is_sandstone_or_limestone"],
    )


def _estimate_floor_u_value(prop):
    """Estimate the floor u-value from the property's floor attributes, wall type and age band."""
    return get_floor_u_value(
        floor_type=prop.floor_type,
        area=prop.floor_area,
        perimeter=prop.perimeter,
        wall_type=prop.wall_type,
        insulation_thickness=prop.floor["insulation_thickness"],
        age_band=prop.age_band,
    )


def _estimate_roof_u_value(prop):
    """Estimate the roof u-value from the property's cleaned roof component and age band."""
    return get_roof_u_value(
        insulation_thickness=prop.roof["insulation_thickness"],
        has_dwelling_above=prop.roof["has_dwelling_above"],
        is_loft=prop.roof["is_loft"],
        is_roof_room=prop.roof["is_roof_room"],
        is_thatched=prop.roof["is_thatched"],
        age_band=prop.age_band,
        is_flat=prop.roof["is_flat"],
        is_pitched=prop.roof["is_pitched"],
        is_at_rafters=prop.roof["is_at_rafters"],
    )


def _backfill_component(scoring_dict, component, suffix, estimate_u_value, prop):
    """Fill in a missing u-value and a missing insulation thickness for one component, in place."""
    u_value_key = f"{component}_thermal_transmittance{suffix}"
    # NOTE(review): this truthiness test treats None and 0 as missing, but a NaN coming out of a
    # DataFrame is truthy and is therefore NOT replaced here - confirm that is handled downstream
    if not scoring_dict[u_value_key]:
        scoring_dict[u_value_key] = estimate_u_value(prop)
    thickness_key = f"{component}_insulation_thickness{suffix}"
    if scoring_dict[thickness_key] is None:
        scoring_dict[thickness_key] = "none"


def create_recommendation_scoring_data(
    property: Property,
    recommendation: dict,
    starting_epc_data: pd.DataFrame,
    ending_epc_data: pd.DataFrame,
    fixed_data: pd.DataFrame,
) -> dict:
    """
    This wrapper function prepares data to be passed to the sap model api

    The first record of each of the three dataframes is merged into a single dict, together with the
    property's UPRN and local authority and an "id" of the form "<property id>+<recommendation id>".
    Missing starting u-values are estimated from the property, missing insulation thicknesses are
    set to "none", and the "_ENDING" values of the component targeted by the recommendation are
    replaced with the recommended upgrade.

    :param property: property the recommendation belongs to; its data and cleaned components are read
    :param recommendation: recommendation dict; "type", "recommendation_id" and "new_u_value" are read,
        plus "parts" for floor insulation
    :param starting_epc_data: dataframe whose first record provides the starting component features
    :param ending_epc_data: dataframe whose first record provides the ending component features
    :param fixed_data: dataframe whose first record provides the fixed features
    :return: dict holding a single scoring row
    :raises NotImplementedError: if the recommendation type is neither "wall_insulation" nor
        "floor_insulation", or if a floor insulation recommendation has more than one part
    """
    recommendation_type = recommendation["type"]
    # Reject unsupported recommendations up front, before any u-value is estimated
    if recommendation_type not in ["wall_insulation", "floor_insulation"]:
        raise NotImplementedError("Implement me")
    if recommendation_type == "floor_insulation" and len(recommendation["parts"]) > 1:
        raise NotImplementedError("Have more than 1 floor insulation part - handle this case")

    scoring_dict = {
        "UPRN": property.data["uprn"],
        "id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
        "LOCAL_AUTHORITY": property.data["local-authority"],
        **starting_epc_data.to_dict("records")[0],
        **ending_epc_data.to_dict("records")[0],
        **fixed_data.to_dict("records")[0]
    }

    # Set starting u-values and insulation thickness if we don't have them
    _backfill_component(scoring_dict, "walls", "", _estimate_wall_u_value, property)
    _backfill_component(scoring_dict, "floor", "", _estimate_floor_u_value, property)
    _backfill_component(scoring_dict, "roof", "", _estimate_roof_u_value, property)

    if recommendation_type == "wall_insulation":
        # The upgrade made here is to the u-value of the walls and the description of the
        # insulation thickness
        scoring_dict["walls_thermal_transmittance_ENDING"] = recommendation["new_u_value"]
        scoring_dict["walls_insulation_thickness_ENDING"] = "above average"
    else:
        # Walls are untouched by this recommendation, so we only fill in what is missing
        _backfill_component(scoring_dict, "walls", "_ENDING", _estimate_wall_u_value, property)

    if recommendation_type == "floor_insulation":
        scoring_dict["floor_thermal_transmittance_ENDING"] = recommendation["new_u_value"]
        # We don't really see above average for this in the training data
        scoring_dict["floor_insulation_thickness_ENDING"] = "average"
    else:
        # Floor is untouched by this recommendation, so we only fill in what is missing
        _backfill_component(scoring_dict, "floor", "_ENDING", _estimate_floor_u_value, property)

    # No supported recommendation type changes the roof, so its ending values are only backfilled
    _backfill_component(scoring_dict, "roof", "_ENDING", _estimate_roof_u_value, property)

    return scoring_dict

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -79,17 +79,17 @@ def sap_to_epc(sap_points: int):
if sap_points <= 0 or sap_points > 100:
raise ValueError("SAP points should be between 1 and 100.")
if sap_points > 91:
if sap_points >= 92:
return "A"
elif sap_points > 80:
elif sap_points >= 81:
return "B"
elif sap_points > 69:
elif sap_points >= 69:
return "C"
elif sap_points > 55:
elif sap_points >= 55:
return "D"
elif sap_points > 39:
elif sap_points >= 39:
return "E"
elif sap_points > 21:
elif sap_points >= 21:
return "F"
else:
return "G"
@ -108,13 +108,13 @@ def epc_to_sap_lower_bound(epc: str):
elif epc == "B":
return 81
elif epc == "C":
return 70
return 69
elif epc == "D":
return 56
return 55
elif epc == "E":
return 40
return 39
elif epc == "F":
return 22
return 21
elif epc == "G":
return 1
else:

View file

@ -62,14 +62,14 @@ class SAPChangeModelAPI:
logger.info("Making request to sap change api")
url = f"{self.base_url}/sapmodel/predict"
payload = {
"file_location": f"s3://retrofit-data-dev/{file_location}",
"file_location": file_location,
"property_id": "", # This should get removed
"portfolio_id": self.portfolio_id,
"created_at": self.timestamp
}
try:
response = requests.post(url, json=payload, headers={"Content-Type": "application/json"})
response = requests.post(url, json=payload, headers={"Content-Type": "application/json"}, timeout=120)
# Check if the response status code is 2xx (success)
response.raise_for_status()

View file

@ -34,4 +34,5 @@ pytz==2023.3
mip==1.15.0
boto3==1.28.3
pandas==1.5.3
pyarrow==12.0.1
pyarrow==12.0.1
textblob

View file

@ -1,15 +1,17 @@
import pytest
import pandas as pd
from unittest.mock import Mock
from epc_api.client import EpcClient
from backend.Property import Property
from open_uprn.OpenUprnClient import OpenUprnClient
from model_data.EpcClean import EpcClean
from etl.epc_clean.EpcClean import EpcClean
# Define some test data
mock_epc_response = {
"rows": [
{
"lmk-key": 1,
"uprn": 1,
"number-habitable-rooms": 5,
"property-type": "House",
"inspection-date": "2023-06-01",
"some-other-key": "some-value",
"roof-description": "Roof Description",
@ -34,6 +36,10 @@ mock_epc_response = {
"construction-age-band": "England and Wales: 1967-1975"
},
{
"lmk-key": 2,
"uprn": 2,
"number-habitable-rooms": 5,
"property-type": "House",
"inspection-date": "2023-05-01",
"some-other-key": "some-other-value",
"roof-description": "Roof Description",
@ -63,6 +69,10 @@ mock_epc_response = {
mock_epc_response_dupe = {
'rows': [
{
"lmk-key": 1,
"uprn": 1,
"number-habitable-rooms": 5,
"property-type": "House",
'inspection-date': '2023-06-01', 'some-other-key': 'some-value', 'roof-description': 'Roof Description',
'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
@ -83,6 +93,10 @@ mock_epc_response_dupe = {
"construction-age-band": "England and Wales: 1967-1975"
},
{
"lmk-key": 2,
"uprn": 2,
"number-habitable-rooms": 5,
"property-type": "House",
'inspection-date': '2023-05-01', 'some-other-key': 'some-other-value',
'roof-description': 'Roof Description', 'walls-description': 'Walls Description',
'windows-description': 'Windows Description', 'mainheat-description': 'Main Heating Description',
@ -104,6 +118,10 @@ mock_epc_response_dupe = {
"construction-age-band": "England and Wales: 1967-1975"
},
{
"lmk-key": 3,
"uprn": 3,
"number-habitable-rooms": 5,
"property-type": "House",
'inspection-date': '2023-06-01', 'some-other-key': 'duplicate-date',
'roof-description': 'Roof Description',
'walls-description': 'Walls Description', 'windows-description': 'Windows Description',
@ -130,7 +148,7 @@ mock_epc_response_dupe = {
class TestProperty:
@pytest.fixture(autouse=True)
def property_instance(self, mock_epc_client, mock_open_uprn_client, mock_cleaner):
def property_instance(self, mock_epc_client, mock_cleaner):
property_instance = Property(1, "AB12CD", "Test Address", epc_client=mock_epc_client)
return property_instance
@ -141,29 +159,18 @@ class TestProperty:
@pytest.fixture
def mock_epc_client(self):
mock_epc_client = Mock(spec=EpcClient())
mock_epc_client = Mock(spec=EpcClient(auth_token="mocked_auth_token"))
mock_epc_client.domestic.search.return_value = mock_epc_response.copy()
mock_epc_client.auth_token = "mocked_auth_token"
return mock_epc_client
@pytest.fixture
def mock_epc_client_dupe_data(self):
mock_epc_client_dupe_data = Mock(spec=EpcClient())
mock_epc_client_dupe_data = Mock(spec=EpcClient(auth_token="mocked_auth_token"))
mock_epc_client_dupe_data.domestic.search.return_value = mock_epc_response_dupe.copy()
mock_epc_client_dupe_data.auth_token = "mocked_auth_token"
return mock_epc_client_dupe_data
@pytest.fixture
def mock_open_uprn_client(self):
mock_open_uprn_client = Mock(spec=OpenUprnClient(path=None, uprns=[12345]))
mock_open_uprn_client.data = pd.DataFrame(
[
{"UPRN": 12345, "longitude": 1.2345, "latitude": 2.3456},
{"UPRN": 12346, "longitude": 3.4567, "latitude": 4.5678}
]
)
return mock_open_uprn_client
@pytest.fixture
def mock_cleaner(self):
lighting_averages = [
@ -186,9 +193,22 @@ class TestProperty:
)
mock_cleaner = Mock(spec=cleaner_spec)
walls_data = {
"original_description": "Walls Description",
"is_cavity_wall": True,
"is_solid_brick": False,
"is_timber_frame": False,
"is_system_built": False,
"is_park_home": False,
"is_cob": False,
"is_sandstone_or_limestone": False,
"is_granite_or_whinstone": False,
}
mock_cleaner.cleaned = {
"roof-description": [{"original_description": "Roof Description"}],
"walls-description": [{"original_description": "Walls Description"}],
"walls-description": [walls_data],
"windows-description": [{"original_description": "Windows Description"}],
"mainheat-description": [{"original_description": "Main Heating Description"}],
"hotwater-description": [{"original_description": "Hot Water Description"}],
@ -201,10 +221,10 @@ class TestProperty:
# Should be mocked auth token
assert inst1.epc_client.auth_token == "mocked_auth_token"
inst2 = Property(3, "AB12CD", "Test Address")
inst2 = Property(3, "AB12CD", "Test Address", epc_client=mock_epc_client)
assert inst2.epc_client.auth_token
inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"})
inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"}, epc_client=mock_epc_client)
assert inst3.data == {"some": "data"}
data = inst3.search_address_epc()
@ -227,11 +247,23 @@ class TestProperty:
# Verify that the components are set correctly
assert property_instance.roof == {"original_description": "Roof Description"}
assert property_instance.walls == {"original_description": "Walls Description"}
assert property_instance.walls == {
"original_description": "Walls Description",
"is_cavity_wall": True,
"is_solid_brick": False,
"is_timber_frame": False,
"is_system_built": False,
"is_park_home": False,
"is_cob": False,
"is_sandstone_or_limestone": False,
"is_granite_or_whinstone": False,
}
assert property_instance.windows == {"original_description": "Windows Description"}
assert property_instance.main_heating == {"original_description": "Main Heating Description"}
assert property_instance.hotwater == {"original_description": "Hot Water Description"}
assert property_instance.wall_type == "cavity"
def test_get_components_without_cleaned_data(self, property_instance, mock_cleaner):
# Modify the mock EpcClean to not have cleaned data
mock_cleaner.cleaned = {}

View file

@ -1,51 +0,0 @@
"""
This application reads in the open uprn data from a static location and loads it into
our database for querying from other services
"""
import os
from conservation_areas.ConservationAreaClient import ConservationAreaClient
from datatypes.datatypes import OpenUprnCoordinateData
def app():
    """
    Check a fixed set of open uprn coordinates against the conservation area datasets

    The conservation area client is pointed at the Historic England shapefile and the gov geojson
    that sit under model_data/local_data next to this file, and is asked to read them. Each
    hard-coded open uprn record is then tested with is_in_conservation_area. The collected
    results are not stored or returned yet (see the TODO at the end).
    """
    base_dir = os.path.abspath(os.path.dirname(__file__))
    client = ConservationAreaClient(
        historic_england_path=base_dir + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp",
        gov_path=base_dir + "/model_data/local_data/gov-conservation-area.geojson"
    )
    client.read()

    # We need to iterate through the open uprn data and check if the coordinates are in a conservation area
    open_uprn_data = [
        {'UPRN': 6032920, 'X_COORDINATE': 535110.0, 'Y_COORDINATE': 181819.0, 'LATITUDE': 51.5191407,
         'LONGITUDE': -0.0540506},
        {'UPRN': 6038625, 'X_COORDINATE': 535374.0, 'Y_COORDINATE': 182784.0, 'LATITUDE': 51.5277492,
         'LONGITUDE': -0.0498772},
        {'UPRN': 34153991, 'X_COORDINATE': 523238.74, 'Y_COORDINATE': 178003.02, 'LATITUDE': 51.4875579,
         'LONGITUDE': -0.226392},
        {'UPRN': 10008299676, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
         'LONGITUDE': -0.0792445},
        {'UPRN': 10008299677, 'X_COORDINATE': 533285.0, 'Y_COORDINATE': 184711.0, 'LATITUDE': 51.5455629,
         'LONGITUDE': -0.0792445},
        {'UPRN': 100021039066, 'X_COORDINATE': 535506.0, 'Y_COORDINATE': 185624.0, 'LATITUDE': 51.5532385,
         'LONGITUDE': -0.0468833},
        {'UPRN': 100021226060, 'X_COORDINATE': 529247.0, 'Y_COORDINATE': 187959.0, 'LATITUDE': 51.5756908,
         'LONGITUDE': -0.1362513},
        {'UPRN': 200003489276, 'X_COORDINATE': 533210.0, 'Y_COORDINATE': 179442.0, 'LATITUDE': 51.4982309,
         'LONGITUDE': -0.0823165}
    ]

    result = []
    for record in open_uprn_data:
        uprn = record["UPRN"]
        in_area = client.is_in_conservation_area(OpenUprnCoordinateData(**record))
        result.append({"uprn": uprn, "is_in_conservation_area": in_area})
    # TODO: Add a method to write to the database

View file

@ -1,5 +0,0 @@
# Data Collection
This service is specifically focused on the collection of data from external sources which aren't easily
accessed via API or via downloadable data sources. For example, wages data requires a specific application to
pull that data from websites, e.g. from Adzuna's API

View file

@ -1,86 +0,0 @@
import requests
import json
from data_collection.config import ADZUNA_API_KEY, ADZUNA_APP_ID
import pandas as pd
import os
import time
from tqdm import tqdm
"""
Table of constituencies and their codes can be downloaded from the Office of National Statistics, found here:
https://geoportal.statistics.gov.uk/datasets/ons::westminster-parliamentary-constituencies-december-2022-names-and
-codes-in-the-united-kingdom/explore
"""
constituencies = pd.read_csv(
os.path.abspath(
os.path.dirname(
__file__)) + "/data_collection/data/Westminster_Parliamentary_Constituencies_("
"December_2022)_Names_and_Codes_in_the_United_Kingdom.csv"
)
constituencies["location_type"] = "constituency"
def retry_api_call(job_title, location, max_retries=10):
    """
    Call get_adzuna_jobs, retrying when the request fails

    HTTP errors, connection errors and timeouts are retried, with a 2 second pause between
    attempts. No pause is made after the final attempt, since nothing follows it.

    :param job_title: job title to search for
    :param location: location to search in
    :param max_retries: maximum number of attempts to make
    :return: the decoded search response, or None if every attempt failed
    """
    for attempt in range(1, max_retries + 1):
        try:
            return get_adzuna_jobs(job_title, location)
        except (requests.HTTPError, requests.ConnectionError, requests.Timeout):
            if attempt == max_retries:
                print(f"Attempt {attempt} failed.")
                break
            print(f"Attempt {attempt} failed. Retrying in 2 seconds...")
            time.sleep(2)

    print(f"Failed after {max_retries} attempts.")
    return None


def get_adzuna_jobs(job_title, location, timeout=30):
    """
    Request the first page of Adzuna job search results for a job title around a location

    :param job_title: value sent as the "what" search parameter
    :param location: value sent as the "where" search parameter
    :param timeout: seconds to wait for the server before giving up; without it a stalled
        connection would block the whole collection run indefinitely
    :return: the response body decoded from json
    :raises requests.HTTPError: if the api answers with an error status
    :raises requests.Timeout: if the server does not answer within the timeout
    """
    base_url = "https://api.adzuna.com/v1/api/jobs"
    country_code = "gb"

    url = f"{base_url}/{country_code}/search/1"
    params = {
        "app_id": ADZUNA_APP_ID,
        "app_key": ADZUNA_API_KEY,
        "results_per_page": 25,
        "what": job_title,
        "where": location,
        "content-type": "application/json",
        "distance": 10
    }

    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    jobs = json.loads(response.text)
    return jobs
# Search terms used to find insulation installer roles
JOB_TITLES = [
    "insulation installer", "internal wall insulation installer", "external wall insulation installer",
    "cavity wall insulation installer", "loft insulation installer", "roof insulation installer",
    "spray foam insulation installer", "insulation technician", "insulation engineer", "iwi insulation installer",
    "iwi installer", "ewi insulation installer", "ewi installer", "cwi insulation installer", "cwi installer",
]

# One row per job returned, tagged with the search that produced it
results = []
for i, job_title in enumerate(JOB_TITLES):
    print("Pulling job title %s of %s" % (str(i + 1), str(len(JOB_TITLES))))
    for _, location_config in tqdm(constituencies.iterrows(), total=constituencies.shape[0]):
        location = location_config["PCON22NM"]
        jobs = retry_api_call(job_title, location)
        # Pause between searches so consecutive requests are spaced out
        time.sleep(0.5)
        # retry_api_call returns None once its retries are exhausted; skip this search instead
        # of failing on the subscript below and losing everything collected so far
        if jobs is None:
            continue
        for job in jobs["results"]:
            to_append = {
                "job_title": job_title,
                "search_location": location,
                "search_location_code": location_config["PCON22CD"],
                **job
            }
            results.append(to_append)

results_df = pd.DataFrame(results)

View file

@ -1,7 +0,0 @@
import os
from dotenv import load_dotenv
load_dotenv(dotenv_path='data_collection/.env')
ADZUNA_API_KEY = os.environ.get('ADZUNA_API_KEY')
ADZUNA_APP_ID = os.environ.get('ADZUNA_APP_ID')

Binary file not shown.

View file

@ -1,651 +0,0 @@
PCON22CD,PCON22NM,ObjectId
E14000530,Aldershot,1
E14000531,Aldridge-Brownhills,2
E14000532,Altrincham and Sale West,3
E14000533,Amber Valley,4
E14000534,Arundel and South Downs,5
E14000535,Ashfield,6
E14000536,Ashford,7
E14000537,Ashton-under-Lyne,8
E14000538,Aylesbury,9
E14000539,Banbury,10
E14000540,Barking,11
E14000541,Barnsley Central,12
E14000542,Barnsley East,13
E14000543,Barrow and Furness,14
E14000544,Basildon and Billericay,15
E14000545,Basingstoke,16
E14000546,Bassetlaw,17
E14000547,Bath,18
E14000548,Batley and Spen,19
E14000549,Battersea,20
E14000550,Beaconsfield,21
E14000551,Beckenham,22
E14000552,Bedford,23
E14000553,Bermondsey and Old Southwark,24
E14000554,Berwick-upon-Tweed,25
E14000555,Bethnal Green and Bow,26
E14000556,Beverley and Holderness,27
E14000557,Bexhill and Battle,28
E14000558,Bexleyheath and Crayford,29
E14000559,Birkenhead,30
E14000560,"Birmingham, Edgbaston",31
E14000561,"Birmingham, Erdington",32
E14000562,"Birmingham, Hall Green",33
E14000563,"Birmingham, Hodge Hill",34
E14000564,"Birmingham, Ladywood",35
E14000565,"Birmingham, Northfield",36
E14000566,"Birmingham, Perry Barr",37
E14000567,"Birmingham, Selly Oak",38
E14000568,"Birmingham, Yardley",39
E14000569,Bishop Auckland,40
E14000570,Blackburn,41
E14000571,Blackley and Broughton,42
E14000572,Blackpool North and Cleveleys,43
E14000573,Blackpool South,44
E14000574,Blaydon,45
E14000575,Blyth Valley,46
E14000576,Bognor Regis and Littlehampton,47
E14000577,Bolsover,48
E14000578,Bolton North East,49
E14000579,Bolton South East,50
E14000830,Newbury,51
E14000831,Newcastle upon Tyne Central,52
E14000832,Newcastle upon Tyne East,53
E14000833,Newcastle upon Tyne North,54
E14000834,Newcastle-under-Lyme,55
E14000835,Newton Abbot,56
E14000836,"Normanton, Pontefract and Castleford",57
E14000837,North Cornwall,58
E14000838,North Devon,59
E14000839,North Dorset,60
E14000840,North Durham,61
E14000841,North East Bedfordshire,62
E14000842,North East Cambridgeshire,63
E14000843,North East Derbyshire,64
E14000844,North East Hampshire,65
E14000845,North East Hertfordshire,66
E14000846,North East Somerset,67
E14000847,North Herefordshire,68
E14000848,North Norfolk,69
E14000849,North Shropshire,70
E14000850,North Somerset,71
E14000851,North Swindon,72
E14000852,North Thanet,73
E14000853,North Tyneside,74
E14000854,North Warwickshire,75
E14000855,North West Cambridgeshire,76
E14000856,North West Durham,77
E14000857,North West Hampshire,78
E14000858,North West Leicestershire,79
E14000859,North West Norfolk,80
E14000860,North Wiltshire,81
E14000861,Northampton North,82
E14000862,Northampton South,83
E14000863,Norwich North,84
E14000864,Norwich South,85
E14000865,Nottingham East,86
E14000866,Nottingham North,87
E14000867,Nottingham South,88
E14000868,Nuneaton,89
E14000869,Old Bexley and Sidcup,90
E14000870,Oldham East and Saddleworth,91
E14000871,Oldham West and Royton,92
E14000872,Orpington,93
E14000873,Oxford East,94
E14000874,Oxford West and Abingdon,95
E14000875,Pendle,96
E14000876,Penistone and Stocksbridge,97
E14000877,Penrith and The Border,98
E14000878,Peterborough,99
E14000879,"Plymouth, Moor View",100
E14000580,Bolton West,101
E14000581,Bootle,102
E14000582,Boston and Skegness,103
E14000583,Bosworth,104
E14000584,Bournemouth East,105
E14000585,Bournemouth West,106
E14000586,Bracknell,107
E14000587,Bradford East,108
E14000588,Bradford South,109
E14000589,Bradford West,110
E14000590,Braintree,111
E14000591,Brent Central,112
E14000592,Brent North,113
E14000593,Brentford and Isleworth,114
E14000594,Brentwood and Ongar,115
E14000595,Bridgwater and West Somerset,116
E14000596,Brigg and Goole,117
E14000597,"Brighton, Kemptown",118
E14000598,"Brighton, Pavilion",119
E14000599,Bristol East,120
E14000600,Bristol North West,121
E14000601,Bristol South,122
E14000602,Bristol West,123
E14000603,Broadland,124
E14000604,Bromley and Chislehurst,125
E14000605,Bromsgrove,126
E14000606,Broxbourne,127
E14000607,Broxtowe,128
E14000608,Buckingham,129
E14000609,Burnley,130
E14000610,Burton,131
E14000611,Bury North,132
E14000612,Bury South,133
E14000613,Bury St Edmunds,134
E14000614,Calder Valley,135
E14000615,Camberwell and Peckham,136
E14000616,Camborne and Redruth,137
E14000617,Cambridge,138
E14000618,Cannock Chase,139
E14000619,Canterbury,140
E14000620,Carlisle,141
E14000621,Carshalton and Wallington,142
E14000622,Castle Point,143
E14000623,Central Devon,144
E14000624,Central Suffolk and North Ipswich,145
E14000625,Charnwood,146
E14000626,Chatham and Aylesford,147
E14000627,Cheadle,148
E14000628,Chelmsford,149
E14000629,Chelsea and Fulham,150
E14000630,Cheltenham,151
E14000631,Chesham and Amersham,152
E14000632,Chesterfield,153
E14000633,Chichester,154
E14000634,Chingford and Woodford Green,155
E14000635,Chippenham,156
E14000636,Chipping Barnet,157
E14000637,Chorley,158
E14000638,Christchurch,159
E14000639,Cities of London and Westminster,160
E14000640,City of Chester,161
E14000641,City of Durham,162
E14000642,Clacton,163
E14000643,Cleethorpes,164
E14000644,Colchester,165
E14000645,Colne Valley,166
E14000646,Congleton,167
E14000647,Copeland,168
E14000648,Corby,169
E14000649,Coventry North East,170
E14000650,Coventry North West,171
E14000651,Coventry South,172
E14000652,Crawley,173
E14000653,Crewe and Nantwich,174
E14000654,Croydon Central,175
E14000655,Croydon North,176
E14000656,Croydon South,177
E14000657,Dagenham and Rainham,178
E14000658,Darlington,179
E14000659,Dartford,180
E14000660,Daventry,181
E14000661,Denton and Reddish,182
E14000662,Derby North,183
E14000663,Derby South,184
E14000664,Derbyshire Dales,185
E14000665,Devizes,186
E14000666,Dewsbury,187
E14000667,Don Valley,188
E14000668,Doncaster Central,189
E14000669,Doncaster North,190
E14000670,Dover,191
E14000671,Dudley North,192
E14000672,Dudley South,193
E14000673,Dulwich and West Norwood,194
E14000674,Ealing Central and Acton,195
E14000675,Ealing North,196
E14000676,"Ealing, Southall",197
E14000677,Easington,198
E14000678,East Devon,199
E14000679,East Ham,200
E14000780,Leeds North West,201
E14000781,Leeds West,202
E14000782,Leicester East,203
E14000783,Leicester South,204
E14000784,Leicester West,205
E14000785,Leigh,206
E14000786,Lewes,207
E14000787,Lewisham East,208
E14000788,Lewisham West and Penge,209
E14000789,"Lewisham, Deptford",210
E14000790,Leyton and Wanstead,211
E14000791,Lichfield,212
E14000792,Lincoln,213
E14000793,"Liverpool, Riverside",214
E14000794,"Liverpool, Walton",215
E14000795,"Liverpool, Wavertree",216
E14000796,"Liverpool, West Derby",217
E14000797,Loughborough,218
E14000798,Louth and Horncastle,219
E14000799,Ludlow,220
E14000800,Luton North,221
E14000801,Luton South,222
E14000802,Macclesfield,223
E14000803,Maidenhead,224
E14000804,Maidstone and The Weald,225
E14000805,Makerfield,226
E14000806,Maldon,227
E14000807,Manchester Central,228
E14000808,"Manchester, Gorton",229
E14000809,"Manchester, Withington",230
E14000810,Mansfield,231
E14000811,Meon Valley,232
E14000812,Meriden,233
E14000813,Mid Bedfordshire,234
E14000814,Mid Derbyshire,235
E14000815,Mid Dorset and North Poole,236
E14000816,Mid Norfolk,237
E14000817,Mid Sussex,238
E14000818,Mid Worcestershire,239
E14000819,Middlesbrough,240
E14000820,Middlesbrough South and East Cleveland,241
E14000821,Milton Keynes North,242
E14000822,Milton Keynes South,243
E14000823,Mitcham and Morden,244
E14000824,Mole Valley,245
E14000825,Morecambe and Lunesdale,246
E14000826,Morley and Outwood,247
E14000827,New Forest East,248
E14000828,New Forest West,249
E14000829,Newark,250
E14000680,East Hampshire,251
E14000681,East Surrey,252
E14000682,East Worthing and Shoreham,253
E14000683,East Yorkshire,254
E14000880,"Plymouth, Sutton and Devonport",255
E14000684,Eastbourne,256
E14000685,Eastleigh,257
E14000881,Poole,258
E14000686,Eddisbury,259
E14000882,Poplar and Limehouse,260
E14000687,Edmonton,261
E14000883,Portsmouth North,262
E14000688,Ellesmere Port and Neston,263
E14000884,Portsmouth South,264
E14000689,Elmet and Rothwell,265
E14000885,Preston,266
E14000690,Eltham,267
E14000886,Pudsey,268
E14000691,Enfield North,269
E14000887,Putney,270
E14000692,"Enfield, Southgate",271
E14000888,Rayleigh and Wickford,272
E14000693,Epping Forest,273
E14000889,Reading East,274
E14000694,Epsom and Ewell,275
E14000890,Reading West,276
E14000695,Erewash,277
E14000891,Redcar,278
E14000696,Erith and Thamesmead,279
E14000892,Redditch,280
E14000697,Esher and Walton,281
E14000893,Reigate,282
E14000698,Exeter,283
E14000894,Ribble Valley,284
E14000699,Fareham,285
E14000895,Richmond (Yorks),286
E14000700,Faversham and Mid Kent,287
E14000896,Richmond Park,288
E14000701,Feltham and Heston,289
E14000897,Rochdale,290
E14000702,Filton and Bradley Stoke,291
E14000898,Rochester and Strood,292
E14000703,Finchley and Golders Green,293
E14000899,Rochford and Southend East,294
E14000704,Folkestone and Hythe,295
E14000900,Romford,296
E14000705,Forest of Dean,297
E14000901,Romsey and Southampton North,298
E14000706,Fylde,299
E14000902,Rossendale and Darwen,300
E14000707,Gainsborough,301
E14000903,Rother Valley,302
E14000904,Rotherham,303
E14000905,Rugby,304
E14000906,"Ruislip, Northwood and Pinner",305
E14000907,Runnymede and Weybridge,306
E14000908,Rushcliffe,307
E14000909,Rutland and Melton,308
E14000910,Saffron Walden,309
E14000911,Salford and Eccles,310
E14000912,Salisbury,311
E14000913,Scarborough and Whitby,312
E14000914,Scunthorpe,313
E14000915,Sedgefield,314
E14000916,Sefton Central,315
E14000917,Selby and Ainsty,316
E14000918,Sevenoaks,317
E14000919,Sheffield Central,318
E14000920,Sheffield South East,319
E14000921,"Sheffield, Brightside and Hillsborough",320
E14000922,"Sheffield, Hallam",321
E14000923,"Sheffield, Heeley",322
E14000924,Sherwood,323
E14000925,Shipley,324
E14000926,Shrewsbury and Atcham,325
E14000927,Sittingbourne and Sheppey,326
E14000928,Skipton and Ripon,327
E14000929,Sleaford and North Hykeham,328
E14000730,Harrogate and Knaresborough,329
E14000731,Harrow East,330
E14000732,Harrow West,331
E14000733,Hartlepool,332
E14000734,Harwich and North Essex,333
E14000735,Hastings and Rye,334
E14000736,Havant,335
E14000737,Hayes and Harlington,336
E14000738,Hazel Grove,337
E14000739,Hemel Hempstead,338
E14000740,Hemsworth,339
E14000741,Hendon,340
E14000742,Henley,341
E14000743,Hereford and South Herefordshire,342
E14000744,Hertford and Stortford,343
E14000745,Hertsmere,344
E14000746,Hexham,345
E14000747,Heywood and Middleton,346
E14000748,High Peak,347
E14000749,Hitchin and Harpenden,348
E14000750,Holborn and St Pancras,349
E14000751,Hornchurch and Upminster,350
E14000752,Hornsey and Wood Green,351
E14000753,Horsham,352
E14000754,Houghton and Sunderland South,353
E14000755,Hove,354
E14000756,Huddersfield,355
E14000757,Huntingdon,356
E14000758,Hyndburn,357
E14000759,Ilford North,358
E14000760,Ilford South,359
E14000761,Ipswich,360
E14000762,Isle of Wight,361
E14000763,Islington North,362
E14000764,Islington South and Finsbury,363
E14000765,Jarrow,364
E14000766,Keighley,365
E14000767,Kenilworth and Southam,366
E14000768,Kensington,367
E14000769,Kettering,368
E14000770,Kingston and Surbiton,369
E14000771,Kingston upon Hull East,370
E14000772,Kingston upon Hull North,371
E14000773,Kingston upon Hull West and Hessle,372
E14000774,Kingswood,373
E14000775,Knowsley,374
E14000776,Lancaster and Fleetwood,375
E14000777,Leeds Central,376
E14000778,Leeds East,377
E14000779,Leeds North East,378
E14000708,Garston and Halewood,379
E14000709,Gateshead,380
E14000710,Gedling,381
E14000711,Gillingham and Rainham,382
E14000712,Gloucester,383
E14000713,Gosport,384
E14000714,Grantham and Stamford,385
E14000715,Gravesham,386
E14000716,Great Grimsby,387
E14000717,Great Yarmouth,388
E14000718,Greenwich and Woolwich,389
E14000719,Guildford,390
E14000720,Hackney North and Stoke Newington,391
E14000721,Hackney South and Shoreditch,392
E14000722,Halesowen and Rowley Regis,393
E14000723,Halifax,394
E14000724,Haltemprice and Howden,395
E14000725,Halton,396
E14000726,Hammersmith,397
E14000727,Hampstead and Kilburn,398
E14000728,Harborough,399
E14000729,Harlow,400
E14000930,Slough,401
E14000931,Solihull,402
E14000932,Somerton and Frome,403
E14000933,South Basildon and East Thurrock,404
E14000934,South Cambridgeshire,405
E14000935,South Derbyshire,406
E14000936,South Dorset,407
E14000937,South East Cambridgeshire,408
E14000938,South East Cornwall,409
E14000939,South Holland and The Deepings,410
E14000940,South Leicestershire,411
E14000941,South Norfolk,412
E14000942,South Northamptonshire,413
E14000943,South Ribble,414
E14000944,South Shields,415
E14000945,South Staffordshire,416
E14000946,South Suffolk,417
E14000947,South Swindon,418
E14000948,South Thanet,419
E14000949,South West Bedfordshire,420
E14000950,South West Devon,421
E14000951,South West Hertfordshire,422
E14000952,South West Norfolk,423
E14000953,South West Surrey,424
E14000954,South West Wiltshire,425
E14000955,"Southampton, Itchen",426
E14000956,"Southampton, Test",427
E14000957,Southend West,428
E14000958,Southport,429
E14000959,Spelthorne,430
E14000960,St Albans,431
E14000961,St Austell and Newquay,432
E14000962,St Helens North,433
E14000963,St Helens South and Whiston,434
E14000964,St Ives,435
E14000965,Stafford,436
E14000966,Staffordshire Moorlands,437
E14000967,Stalybridge and Hyde,438
E14000968,Stevenage,439
E14000969,Stockport,440
E14000970,Stockton North,441
E14000971,Stockton South,442
E14000972,Stoke-on-Trent Central,443
E14000973,Stoke-on-Trent North,444
E14000974,Stoke-on-Trent South,445
E14000975,Stone,446
E14000976,Stourbridge,447
E14000977,Stratford-on-Avon,448
E14000978,Streatham,449
E14000979,Stretford and Urmston,450
E14000980,Stroud,451
E14000981,Suffolk Coastal,452
E14000982,Sunderland Central,453
E14000983,Surrey Heath,454
E14000984,Sutton and Cheam,455
E14000985,Sutton Coldfield,456
E14000986,Tamworth,457
E14000987,Tatton,458
E14000988,Taunton Deane,459
E14000989,Telford,460
E14000990,Tewkesbury,461
E14000991,The Cotswolds,462
E14000992,The Wrekin,463
E14000993,Thirsk and Malton,464
E14000994,Thornbury and Yate,465
E14000995,Thurrock,466
E14000996,Tiverton and Honiton,467
E14000997,Tonbridge and Malling,468
E14000998,Tooting,469
E14000999,Torbay,470
E14001000,Torridge and West Devon,471
E14001001,Totnes,472
E14001002,Tottenham,473
E14001003,Truro and Falmouth,474
E14001004,Tunbridge Wells,475
E14001005,Twickenham,476
E14001006,Tynemouth,477
E14001007,Uxbridge and South Ruislip,478
E14001008,Vauxhall,479
E14001009,Wakefield,480
E14001010,Wallasey,481
E14001011,Walsall North,482
E14001012,Walsall South,483
E14001013,Walthamstow,484
E14001014,Wansbeck,485
E14001015,Wantage,486
E14001016,Warley,487
E14001017,Warrington North,488
E14001018,Warrington South,489
E14001019,Warwick and Leamington,490
E14001020,Washington and Sunderland West,491
E14001021,Watford,492
E14001022,Waveney,493
E14001023,Wealden,494
E14001024,Weaver Vale,495
E14001025,Wellingborough,496
E14001026,Wells,497
E14001027,Welwyn Hatfield,498
E14001028,Wentworth and Dearne,499
E14001029,West Bromwich East,500
E14001030,West Bromwich West,501
E14001031,West Dorset,502
E14001032,West Ham,503
E14001033,West Lancashire,504
E14001034,West Suffolk,505
E14001035,West Worcestershire,506
E14001036,Westminster North,507
E14001037,Westmorland and Lonsdale,508
E14001038,Weston-Super-Mare,509
E14001039,Wigan,510
E14001040,Wimbledon,511
E14001041,Winchester,512
E14001042,Windsor,513
E14001043,Wirral South,514
E14001044,Wirral West,515
E14001045,Witham,516
E14001046,Witney,517
E14001047,Woking,518
E14001048,Wokingham,519
E14001049,Wolverhampton North East,520
E14001050,Wolverhampton South East,521
E14001051,Wolverhampton South West,522
E14001052,Worcester,523
E14001053,Workington,524
E14001054,Worsley and Eccles South,525
E14001055,Worthing West,526
E14001056,Wycombe,527
E14001057,Wyre and Preston North,528
E14001058,Wyre Forest,529
E14001059,Wythenshawe and Sale East,530
E14001060,Yeovil,531
E14001061,York Central,532
E14001062,York Outer,533
N06000001,Belfast East,534
N06000002,Belfast North,535
N06000003,Belfast South,536
N06000004,Belfast West,537
N06000005,East Antrim,538
N06000006,East Londonderry,539
N06000007,Fermanagh and South Tyrone,540
N06000008,Foyle,541
N06000009,Lagan Valley,542
N06000010,Mid Ulster,543
N06000011,Newry and Armagh,544
N06000012,North Antrim,545
N06000013,North Down,546
N06000014,South Antrim,547
N06000015,South Down,548
N06000016,Strangford,549
N06000017,Upper Bann,550
S14000050,Ochil and South Perthshire,551
S14000051,Orkney and Shetland,552
S14000052,Paisley and Renfrewshire North,553
S14000053,Paisley and Renfrewshire South,554
S14000054,Perth and North Perthshire,555
S14000055,"Ross, Skye and Lochaber",556
S14000056,Rutherglen and Hamilton West,557
S14000057,Stirling,558
S14000058,West Aberdeenshire and Kincardine,559
S14000059,West Dunbartonshire,560
W07000041,Ynys Môn,561
W07000042,Delyn,562
W07000043,Alyn and Deeside,563
W07000044,Wrexham,564
W07000045,Llanelli,565
W07000046,Gower,566
W07000047,Swansea West,567
W07000048,Swansea East,568
W07000049,Aberavon,569
W07000050,Cardiff Central,570
W07000051,Cardiff North,571
W07000052,Rhondda,572
W07000053,Torfaen,573
W07000054,Monmouth,574
W07000055,Newport East,575
W07000056,Newport West,576
W07000057,Arfon,577
W07000058,Aberconwy,578
W07000059,Clwyd West,579
W07000060,Vale of Clwyd,580
W07000061,Dwyfor Meirionnydd,581
W07000062,Clwyd South,582
W07000063,Montgomeryshire,583
W07000064,Ceredigion,584
W07000065,Preseli Pembrokeshire,585
W07000066,Carmarthen West and South Pembrokeshire,586
W07000067,Carmarthen East and Dinefwr,587
W07000068,Brecon and Radnorshire,588
W07000069,Neath,589
W07000070,Cynon Valley,590
W07000071,Merthyr Tydfil and Rhymney,591
W07000072,Blaenau Gwent,592
W07000073,Bridgend,593
W07000074,Ogmore,594
W07000075,Pontypridd,595
W07000076,Caerphilly,596
W07000077,Islwyn,597
W07000078,Vale of Glamorgan,598
W07000079,Cardiff West,599
W07000080,Cardiff South and Penarth,600
N06000018,West Tyrone,601
S14000001,Aberdeen North,602
S14000002,Aberdeen South,603
S14000003,Airdrie and Shotts,604
S14000004,Angus,605
S14000005,Argyll and Bute,606
S14000006,"Ayr, Carrick and Cumnock",607
S14000007,Banff and Buchan,608
S14000008,"Berwickshire, Roxburgh and Selkirk",609
S14000009,"Caithness, Sutherland and Easter Ross",610
S14000010,Central Ayrshire,611
S14000011,"Coatbridge, Chryston and Bellshill",612
S14000012,"Cumbernauld, Kilsyth and Kirkintilloch East",613
S14000013,Dumfries and Galloway,614
S14000014,"Dumfriesshire, Clydesdale and Tweeddale",615
S14000015,Dundee East,616
S14000016,Dundee West,617
S14000017,Dunfermline and West Fife,618
S14000018,East Dunbartonshire,619
S14000019,"East Kilbride, Strathaven and Lesmahagow",620
S14000020,East Lothian,621
S14000021,East Renfrewshire,622
S14000022,Edinburgh East,623
S14000023,Edinburgh North and Leith,624
S14000024,Edinburgh South,625
S14000025,Edinburgh South West,626
S14000026,Edinburgh West,627
S14000027,Na h-Eileanan an Iar,628
S14000028,Falkirk,629
S14000029,Glasgow Central,630
S14000030,Glasgow East,631
S14000031,Glasgow North,632
S14000032,Glasgow North East,633
S14000033,Glasgow North West,634
S14000034,Glasgow South,635
S14000035,Glasgow South West,636
S14000036,Glenrothes,637
S14000037,Gordon,638
S14000038,Inverclyde,639
S14000039,"Inverness, Nairn, Badenoch and Strathspey",640
S14000040,Kilmarnock and Loudoun,641
S14000041,Kirkcaldy and Cowdenbeath,642
S14000042,Lanark and Hamilton East,643
S14000043,Linlithgow and East Falkirk,644
S14000044,Livingston,645
S14000045,Midlothian,646
S14000046,Moray,647
S14000047,Motherwell and Wishaw,648
S14000048,North Ayrshire and Arran,649
S14000049,North East Fife,650
1 PCON22CD PCON22NM ObjectId
2 E14000530 Aldershot 1
3 E14000531 Aldridge-Brownhills 2
4 E14000532 Altrincham and Sale West 3
5 E14000533 Amber Valley 4
6 E14000534 Arundel and South Downs 5
7 E14000535 Ashfield 6
8 E14000536 Ashford 7
9 E14000537 Ashton-under-Lyne 8
10 E14000538 Aylesbury 9
11 E14000539 Banbury 10
12 E14000540 Barking 11
13 E14000541 Barnsley Central 12
14 E14000542 Barnsley East 13
15 E14000543 Barrow and Furness 14
16 E14000544 Basildon and Billericay 15
17 E14000545 Basingstoke 16
18 E14000546 Bassetlaw 17
19 E14000547 Bath 18
20 E14000548 Batley and Spen 19
21 E14000549 Battersea 20
22 E14000550 Beaconsfield 21
23 E14000551 Beckenham 22
24 E14000552 Bedford 23
25 E14000553 Bermondsey and Old Southwark 24
26 E14000554 Berwick-upon-Tweed 25
27 E14000555 Bethnal Green and Bow 26
28 E14000556 Beverley and Holderness 27
29 E14000557 Bexhill and Battle 28
30 E14000558 Bexleyheath and Crayford 29
31 E14000559 Birkenhead 30
32 E14000560 Birmingham, Edgbaston 31
33 E14000561 Birmingham, Erdington 32
34 E14000562 Birmingham, Hall Green 33
35 E14000563 Birmingham, Hodge Hill 34
36 E14000564 Birmingham, Ladywood 35
37 E14000565 Birmingham, Northfield 36
38 E14000566 Birmingham, Perry Barr 37
39 E14000567 Birmingham, Selly Oak 38
40 E14000568 Birmingham, Yardley 39
41 E14000569 Bishop Auckland 40
42 E14000570 Blackburn 41
43 E14000571 Blackley and Broughton 42
44 E14000572 Blackpool North and Cleveleys 43
45 E14000573 Blackpool South 44
46 E14000574 Blaydon 45
47 E14000575 Blyth Valley 46
48 E14000576 Bognor Regis and Littlehampton 47
49 E14000577 Bolsover 48
50 E14000578 Bolton North East 49
51 E14000579 Bolton South East 50
52 E14000830 Newbury 51
53 E14000831 Newcastle upon Tyne Central 52
54 E14000832 Newcastle upon Tyne East 53
55 E14000833 Newcastle upon Tyne North 54
56 E14000834 Newcastle-under-Lyme 55
57 E14000835 Newton Abbot 56
58 E14000836 Normanton, Pontefract and Castleford 57
59 E14000837 North Cornwall 58
60 E14000838 North Devon 59
61 E14000839 North Dorset 60
62 E14000840 North Durham 61
63 E14000841 North East Bedfordshire 62
64 E14000842 North East Cambridgeshire 63
65 E14000843 North East Derbyshire 64
66 E14000844 North East Hampshire 65
67 E14000845 North East Hertfordshire 66
68 E14000846 North East Somerset 67
69 E14000847 North Herefordshire 68
70 E14000848 North Norfolk 69
71 E14000849 North Shropshire 70
72 E14000850 North Somerset 71
73 E14000851 North Swindon 72
74 E14000852 North Thanet 73
75 E14000853 North Tyneside 74
76 E14000854 North Warwickshire 75
77 E14000855 North West Cambridgeshire 76
78 E14000856 North West Durham 77
79 E14000857 North West Hampshire 78
80 E14000858 North West Leicestershire 79
81 E14000859 North West Norfolk 80
82 E14000860 North Wiltshire 81
83 E14000861 Northampton North 82
84 E14000862 Northampton South 83
85 E14000863 Norwich North 84
86 E14000864 Norwich South 85
87 E14000865 Nottingham East 86
88 E14000866 Nottingham North 87
89 E14000867 Nottingham South 88
90 E14000868 Nuneaton 89
91 E14000869 Old Bexley and Sidcup 90
92 E14000870 Oldham East and Saddleworth 91
93 E14000871 Oldham West and Royton 92
94 E14000872 Orpington 93
95 E14000873 Oxford East 94
96 E14000874 Oxford West and Abingdon 95
97 E14000875 Pendle 96
98 E14000876 Penistone and Stocksbridge 97
99 E14000877 Penrith and The Border 98
100 E14000878 Peterborough 99
101 E14000879 Plymouth, Moor View 100
102 E14000580 Bolton West 101
103 E14000581 Bootle 102
104 E14000582 Boston and Skegness 103
105 E14000583 Bosworth 104
106 E14000584 Bournemouth East 105
107 E14000585 Bournemouth West 106
108 E14000586 Bracknell 107
109 E14000587 Bradford East 108
110 E14000588 Bradford South 109
111 E14000589 Bradford West 110
112 E14000590 Braintree 111
113 E14000591 Brent Central 112
114 E14000592 Brent North 113
115 E14000593 Brentford and Isleworth 114
116 E14000594 Brentwood and Ongar 115
117 E14000595 Bridgwater and West Somerset 116
118 E14000596 Brigg and Goole 117
119 E14000597 Brighton, Kemptown 118
120 E14000598 Brighton, Pavilion 119
121 E14000599 Bristol East 120
122 E14000600 Bristol North West 121
123 E14000601 Bristol South 122
124 E14000602 Bristol West 123
125 E14000603 Broadland 124
126 E14000604 Bromley and Chislehurst 125
127 E14000605 Bromsgrove 126
128 E14000606 Broxbourne 127
129 E14000607 Broxtowe 128
130 E14000608 Buckingham 129
131 E14000609 Burnley 130
132 E14000610 Burton 131
133 E14000611 Bury North 132
134 E14000612 Bury South 133
135 E14000613 Bury St Edmunds 134
136 E14000614 Calder Valley 135
137 E14000615 Camberwell and Peckham 136
138 E14000616 Camborne and Redruth 137
139 E14000617 Cambridge 138
140 E14000618 Cannock Chase 139
141 E14000619 Canterbury 140
142 E14000620 Carlisle 141
143 E14000621 Carshalton and Wallington 142
144 E14000622 Castle Point 143
145 E14000623 Central Devon 144
146 E14000624 Central Suffolk and North Ipswich 145
147 E14000625 Charnwood 146
148 E14000626 Chatham and Aylesford 147
149 E14000627 Cheadle 148
150 E14000628 Chelmsford 149
151 E14000629 Chelsea and Fulham 150
152 E14000630 Cheltenham 151
153 E14000631 Chesham and Amersham 152
154 E14000632 Chesterfield 153
155 E14000633 Chichester 154
156 E14000634 Chingford and Woodford Green 155
157 E14000635 Chippenham 156
158 E14000636 Chipping Barnet 157
159 E14000637 Chorley 158
160 E14000638 Christchurch 159
161 E14000639 Cities of London and Westminster 160
162 E14000640 City of Chester 161
163 E14000641 City of Durham 162
164 E14000642 Clacton 163
165 E14000643 Cleethorpes 164
166 E14000644 Colchester 165
167 E14000645 Colne Valley 166
168 E14000646 Congleton 167
169 E14000647 Copeland 168
170 E14000648 Corby 169
171 E14000649 Coventry North East 170
172 E14000650 Coventry North West 171
173 E14000651 Coventry South 172
174 E14000652 Crawley 173
175 E14000653 Crewe and Nantwich 174
176 E14000654 Croydon Central 175
177 E14000655 Croydon North 176
178 E14000656 Croydon South 177
179 E14000657 Dagenham and Rainham 178
180 E14000658 Darlington 179
181 E14000659 Dartford 180
182 E14000660 Daventry 181
183 E14000661 Denton and Reddish 182
184 E14000662 Derby North 183
185 E14000663 Derby South 184
186 E14000664 Derbyshire Dales 185
187 E14000665 Devizes 186
188 E14000666 Dewsbury 187
189 E14000667 Don Valley 188
190 E14000668 Doncaster Central 189
191 E14000669 Doncaster North 190
192 E14000670 Dover 191
193 E14000671 Dudley North 192
194 E14000672 Dudley South 193
195 E14000673 Dulwich and West Norwood 194
196 E14000674 Ealing Central and Acton 195
197 E14000675 Ealing North 196
198 E14000676 Ealing, Southall 197
199 E14000677 Easington 198
200 E14000678 East Devon 199
201 E14000679 East Ham 200
202 E14000780 Leeds North West 201
203 E14000781 Leeds West 202
204 E14000782 Leicester East 203
205 E14000783 Leicester South 204
206 E14000784 Leicester West 205
207 E14000785 Leigh 206
208 E14000786 Lewes 207
209 E14000787 Lewisham East 208
210 E14000788 Lewisham West and Penge 209
211 E14000789 Lewisham, Deptford 210
212 E14000790 Leyton and Wanstead 211
213 E14000791 Lichfield 212
214 E14000792 Lincoln 213
215 E14000793 Liverpool, Riverside 214
216 E14000794 Liverpool, Walton 215
217 E14000795 Liverpool, Wavertree 216
218 E14000796 Liverpool, West Derby 217
219 E14000797 Loughborough 218
220 E14000798 Louth and Horncastle 219
221 E14000799 Ludlow 220
222 E14000800 Luton North 221
223 E14000801 Luton South 222
224 E14000802 Macclesfield 223
225 E14000803 Maidenhead 224
226 E14000804 Maidstone and The Weald 225
227 E14000805 Makerfield 226
228 E14000806 Maldon 227
229 E14000807 Manchester Central 228
230 E14000808 Manchester, Gorton 229
231 E14000809 Manchester, Withington 230
232 E14000810 Mansfield 231
233 E14000811 Meon Valley 232
234 E14000812 Meriden 233
235 E14000813 Mid Bedfordshire 234
236 E14000814 Mid Derbyshire 235
237 E14000815 Mid Dorset and North Poole 236
238 E14000816 Mid Norfolk 237
239 E14000817 Mid Sussex 238
240 E14000818 Mid Worcestershire 239
241 E14000819 Middlesbrough 240
242 E14000820 Middlesbrough South and East Cleveland 241
243 E14000821 Milton Keynes North 242
244 E14000822 Milton Keynes South 243
245 E14000823 Mitcham and Morden 244
246 E14000824 Mole Valley 245
247 E14000825 Morecambe and Lunesdale 246
248 E14000826 Morley and Outwood 247
249 E14000827 New Forest East 248
250 E14000828 New Forest West 249
251 E14000829 Newark 250
252 E14000680 East Hampshire 251
253 E14000681 East Surrey 252
254 E14000682 East Worthing and Shoreham 253
255 E14000683 East Yorkshire 254
256 E14000880 Plymouth, Sutton and Devonport 255
257 E14000684 Eastbourne 256
258 E14000685 Eastleigh 257
259 E14000881 Poole 258
260 E14000686 Eddisbury 259
261 E14000882 Poplar and Limehouse 260
262 E14000687 Edmonton 261
263 E14000883 Portsmouth North 262
264 E14000688 Ellesmere Port and Neston 263
265 E14000884 Portsmouth South 264
266 E14000689 Elmet and Rothwell 265
267 E14000885 Preston 266
268 E14000690 Eltham 267
269 E14000886 Pudsey 268
270 E14000691 Enfield North 269
271 E14000887 Putney 270
272 E14000692 Enfield, Southgate 271
273 E14000888 Rayleigh and Wickford 272
274 E14000693 Epping Forest 273
275 E14000889 Reading East 274
276 E14000694 Epsom and Ewell 275
277 E14000890 Reading West 276
278 E14000695 Erewash 277
279 E14000891 Redcar 278
280 E14000696 Erith and Thamesmead 279
281 E14000892 Redditch 280
282 E14000697 Esher and Walton 281
283 E14000893 Reigate 282
284 E14000698 Exeter 283
285 E14000894 Ribble Valley 284
286 E14000699 Fareham 285
287 E14000895 Richmond (Yorks) 286
288 E14000700 Faversham and Mid Kent 287
289 E14000896 Richmond Park 288
290 E14000701 Feltham and Heston 289
291 E14000897 Rochdale 290
292 E14000702 Filton and Bradley Stoke 291
293 E14000898 Rochester and Strood 292
294 E14000703 Finchley and Golders Green 293
295 E14000899 Rochford and Southend East 294
296 E14000704 Folkestone and Hythe 295
297 E14000900 Romford 296
298 E14000705 Forest of Dean 297
299 E14000901 Romsey and Southampton North 298
300 E14000706 Fylde 299
301 E14000902 Rossendale and Darwen 300
302 E14000707 Gainsborough 301
303 E14000903 Rother Valley 302
304 E14000904 Rotherham 303
305 E14000905 Rugby 304
306 E14000906 Ruislip, Northwood and Pinner 305
307 E14000907 Runnymede and Weybridge 306
308 E14000908 Rushcliffe 307
309 E14000909 Rutland and Melton 308
310 E14000910 Saffron Walden 309
311 E14000911 Salford and Eccles 310
312 E14000912 Salisbury 311
313 E14000913 Scarborough and Whitby 312
314 E14000914 Scunthorpe 313
315 E14000915 Sedgefield 314
316 E14000916 Sefton Central 315
317 E14000917 Selby and Ainsty 316
318 E14000918 Sevenoaks 317
319 E14000919 Sheffield Central 318
320 E14000920 Sheffield South East 319
321 E14000921 Sheffield, Brightside and Hillsborough 320
322 E14000922 Sheffield, Hallam 321
323 E14000923 Sheffield, Heeley 322
324 E14000924 Sherwood 323
325 E14000925 Shipley 324
326 E14000926 Shrewsbury and Atcham 325
327 E14000927 Sittingbourne and Sheppey 326
328 E14000928 Skipton and Ripon 327
329 E14000929 Sleaford and North Hykeham 328
330 E14000730 Harrogate and Knaresborough 329
331 E14000731 Harrow East 330
332 E14000732 Harrow West 331
333 E14000733 Hartlepool 332
334 E14000734 Harwich and North Essex 333
335 E14000735 Hastings and Rye 334
336 E14000736 Havant 335
337 E14000737 Hayes and Harlington 336
338 E14000738 Hazel Grove 337
339 E14000739 Hemel Hempstead 338
340 E14000740 Hemsworth 339
341 E14000741 Hendon 340
342 E14000742 Henley 341
343 E14000743 Hereford and South Herefordshire 342
344 E14000744 Hertford and Stortford 343
345 E14000745 Hertsmere 344
346 E14000746 Hexham 345
347 E14000747 Heywood and Middleton 346
348 E14000748 High Peak 347
349 E14000749 Hitchin and Harpenden 348
350 E14000750 Holborn and St Pancras 349
351 E14000751 Hornchurch and Upminster 350
352 E14000752 Hornsey and Wood Green 351
353 E14000753 Horsham 352
354 E14000754 Houghton and Sunderland South 353
355 E14000755 Hove 354
356 E14000756 Huddersfield 355
357 E14000757 Huntingdon 356
358 E14000758 Hyndburn 357
359 E14000759 Ilford North 358
360 E14000760 Ilford South 359
361 E14000761 Ipswich 360
362 E14000762 Isle of Wight 361
363 E14000763 Islington North 362
364 E14000764 Islington South and Finsbury 363
365 E14000765 Jarrow 364
366 E14000766 Keighley 365
367 E14000767 Kenilworth and Southam 366
368 E14000768 Kensington 367
369 E14000769 Kettering 368
370 E14000770 Kingston and Surbiton 369
371 E14000771 Kingston upon Hull East 370
372 E14000772 Kingston upon Hull North 371
373 E14000773 Kingston upon Hull West and Hessle 372
374 E14000774 Kingswood 373
375 E14000775 Knowsley 374
376 E14000776 Lancaster and Fleetwood 375
377 E14000777 Leeds Central 376
378 E14000778 Leeds East 377
379 E14000779 Leeds North East 378
380 E14000708 Garston and Halewood 379
381 E14000709 Gateshead 380
382 E14000710 Gedling 381
383 E14000711 Gillingham and Rainham 382
384 E14000712 Gloucester 383
385 E14000713 Gosport 384
386 E14000714 Grantham and Stamford 385
387 E14000715 Gravesham 386
388 E14000716 Great Grimsby 387
389 E14000717 Great Yarmouth 388
390 E14000718 Greenwich and Woolwich 389
391 E14000719 Guildford 390
392 E14000720 Hackney North and Stoke Newington 391
393 E14000721 Hackney South and Shoreditch 392
394 E14000722 Halesowen and Rowley Regis 393
395 E14000723 Halifax 394
396 E14000724 Haltemprice and Howden 395
397 E14000725 Halton 396
398 E14000726 Hammersmith 397
399 E14000727 Hampstead and Kilburn 398
400 E14000728 Harborough 399
401 E14000729 Harlow 400
402 E14000930 Slough 401
403 E14000931 Solihull 402
404 E14000932 Somerton and Frome 403
405 E14000933 South Basildon and East Thurrock 404
406 E14000934 South Cambridgeshire 405
407 E14000935 South Derbyshire 406
408 E14000936 South Dorset 407
409 E14000937 South East Cambridgeshire 408
410 E14000938 South East Cornwall 409
411 E14000939 South Holland and The Deepings 410
412 E14000940 South Leicestershire 411
413 E14000941 South Norfolk 412
414 E14000942 South Northamptonshire 413
415 E14000943 South Ribble 414
416 E14000944 South Shields 415
417 E14000945 South Staffordshire 416
418 E14000946 South Suffolk 417
419 E14000947 South Swindon 418
420 E14000948 South Thanet 419
421 E14000949 South West Bedfordshire 420
422 E14000950 South West Devon 421
423 E14000951 South West Hertfordshire 422
424 E14000952 South West Norfolk 423
425 E14000953 South West Surrey 424
426 E14000954 South West Wiltshire 425
427 E14000955 Southampton, Itchen 426
428 E14000956 Southampton, Test 427
429 E14000957 Southend West 428
430 E14000958 Southport 429
431 E14000959 Spelthorne 430
432 E14000960 St Albans 431
433 E14000961 St Austell and Newquay 432
434 E14000962 St Helens North 433
435 E14000963 St Helens South and Whiston 434
436 E14000964 St Ives 435
437 E14000965 Stafford 436
438 E14000966 Staffordshire Moorlands 437
439 E14000967 Stalybridge and Hyde 438
440 E14000968 Stevenage 439
441 E14000969 Stockport 440
442 E14000970 Stockton North 441
443 E14000971 Stockton South 442
444 E14000972 Stoke-on-Trent Central 443
445 E14000973 Stoke-on-Trent North 444
446 E14000974 Stoke-on-Trent South 445
447 E14000975 Stone 446
448 E14000976 Stourbridge 447
449 E14000977 Stratford-on-Avon 448
450 E14000978 Streatham 449
451 E14000979 Stretford and Urmston 450
452 E14000980 Stroud 451
453 E14000981 Suffolk Coastal 452
454 E14000982 Sunderland Central 453
455 E14000983 Surrey Heath 454
456 E14000984 Sutton and Cheam 455
457 E14000985 Sutton Coldfield 456
458 E14000986 Tamworth 457
459 E14000987 Tatton 458
460 E14000988 Taunton Deane 459
461 E14000989 Telford 460
462 E14000990 Tewkesbury 461
463 E14000991 The Cotswolds 462
464 E14000992 The Wrekin 463
465 E14000993 Thirsk and Malton 464
466 E14000994 Thornbury and Yate 465
467 E14000995 Thurrock 466
468 E14000996 Tiverton and Honiton 467
469 E14000997 Tonbridge and Malling 468
470 E14000998 Tooting 469
471 E14000999 Torbay 470
472 E14001000 Torridge and West Devon 471
473 E14001001 Totnes 472
474 E14001002 Tottenham 473
475 E14001003 Truro and Falmouth 474
476 E14001004 Tunbridge Wells 475
477 E14001005 Twickenham 476
478 E14001006 Tynemouth 477
479 E14001007 Uxbridge and South Ruislip 478
480 E14001008 Vauxhall 479
481 E14001009 Wakefield 480
482 E14001010 Wallasey 481
483 E14001011 Walsall North 482
484 E14001012 Walsall South 483
485 E14001013 Walthamstow 484
486 E14001014 Wansbeck 485
487 E14001015 Wantage 486
488 E14001016 Warley 487
489 E14001017 Warrington North 488
490 E14001018 Warrington South 489
491 E14001019 Warwick and Leamington 490
492 E14001020 Washington and Sunderland West 491
493 E14001021 Watford 492
494 E14001022 Waveney 493
495 E14001023 Wealden 494
496 E14001024 Weaver Vale 495
497 E14001025 Wellingborough 496
498 E14001026 Wells 497
499 E14001027 Welwyn Hatfield 498
500 E14001028 Wentworth and Dearne 499
501 E14001029 West Bromwich East 500
502 E14001030 West Bromwich West 501
503 E14001031 West Dorset 502
504 E14001032 West Ham 503
505 E14001033 West Lancashire 504
506 E14001034 West Suffolk 505
507 E14001035 West Worcestershire 506
508 E14001036 Westminster North 507
509 E14001037 Westmorland and Lonsdale 508
510 E14001038 Weston-Super-Mare 509
511 E14001039 Wigan 510
512 E14001040 Wimbledon 511
513 E14001041 Winchester 512
514 E14001042 Windsor 513
515 E14001043 Wirral South 514
516 E14001044 Wirral West 515
517 E14001045 Witham 516
518 E14001046 Witney 517
519 E14001047 Woking 518
520 E14001048 Wokingham 519
521 E14001049 Wolverhampton North East 520
522 E14001050 Wolverhampton South East 521
523 E14001051 Wolverhampton South West 522
524 E14001052 Worcester 523
525 E14001053 Workington 524
526 E14001054 Worsley and Eccles South 525
527 E14001055 Worthing West 526
528 E14001056 Wycombe 527
529 E14001057 Wyre and Preston North 528
530 E14001058 Wyre Forest 529
531 E14001059 Wythenshawe and Sale East 530
532 E14001060 Yeovil 531
533 E14001061 York Central 532
534 E14001062 York Outer 533
535 N06000001 Belfast East 534
536 N06000002 Belfast North 535
537 N06000003 Belfast South 536
538 N06000004 Belfast West 537
539 N06000005 East Antrim 538
540 N06000006 East Londonderry 539
541 N06000007 Fermanagh and South Tyrone 540
542 N06000008 Foyle 541
543 N06000009 Lagan Valley 542
544 N06000010 Mid Ulster 543
545 N06000011 Newry and Armagh 544
546 N06000012 North Antrim 545
547 N06000013 North Down 546
548 N06000014 South Antrim 547
549 N06000015 South Down 548
550 N06000016 Strangford 549
551 N06000017 Upper Bann 550
552 S14000050 Ochil and South Perthshire 551
553 S14000051 Orkney and Shetland 552
554 S14000052 Paisley and Renfrewshire North 553
555 S14000053 Paisley and Renfrewshire South 554
556 S14000054 Perth and North Perthshire 555
557 S14000055 Ross, Skye and Lochaber 556
558 S14000056 Rutherglen and Hamilton West 557
559 S14000057 Stirling 558
560 S14000058 West Aberdeenshire and Kincardine 559
561 S14000059 West Dunbartonshire 560
562 W07000041 Ynys Môn 561
563 W07000042 Delyn 562
564 W07000043 Alyn and Deeside 563
565 W07000044 Wrexham 564
566 W07000045 Llanelli 565
567 W07000046 Gower 566
568 W07000047 Swansea West 567
569 W07000048 Swansea East 568
570 W07000049 Aberavon 569
571 W07000050 Cardiff Central 570
572 W07000051 Cardiff North 571
573 W07000052 Rhondda 572
574 W07000053 Torfaen 573
575 W07000054 Monmouth 574
576 W07000055 Newport East 575
577 W07000056 Newport West 576
578 W07000057 Arfon 577
579 W07000058 Aberconwy 578
580 W07000059 Clwyd West 579
581 W07000060 Vale of Clwyd 580
582 W07000061 Dwyfor Meirionnydd 581
583 W07000062 Clwyd South 582
584 W07000063 Montgomeryshire 583
585 W07000064 Ceredigion 584
586 W07000065 Preseli Pembrokeshire 585
587 W07000066 Carmarthen West and South Pembrokeshire 586
588 W07000067 Carmarthen East and Dinefwr 587
589 W07000068 Brecon and Radnorshire 588
590 W07000069 Neath 589
591 W07000070 Cynon Valley 590
592 W07000071 Merthyr Tydfil and Rhymney 591
593 W07000072 Blaenau Gwent 592
594 W07000073 Bridgend 593
595 W07000074 Ogmore 594
596 W07000075 Pontypridd 595
597 W07000076 Caerphilly 596
598 W07000077 Islwyn 597
599 W07000078 Vale of Glamorgan 598
600 W07000079 Cardiff West 599
601 W07000080 Cardiff South and Penarth 600
602 N06000018 West Tyrone 601
603 S14000001 Aberdeen North 602
604 S14000002 Aberdeen South 603
605 S14000003 Airdrie and Shotts 604
606 S14000004 Angus 605
607 S14000005 Argyll and Bute 606
608 S14000006 Ayr, Carrick and Cumnock 607
609 S14000007 Banff and Buchan 608
610 S14000008 Berwickshire, Roxburgh and Selkirk 609
611 S14000009 Caithness, Sutherland and Easter Ross 610
612 S14000010 Central Ayrshire 611
613 S14000011 Coatbridge, Chryston and Bellshill 612
614 S14000012 Cumbernauld, Kilsyth and Kirkintilloch East 613
615 S14000013 Dumfries and Galloway 614
616 S14000014 Dumfriesshire, Clydesdale and Tweeddale 615
617 S14000015 Dundee East 616
618 S14000016 Dundee West 617
619 S14000017 Dunfermline and West Fife 618
620 S14000018 East Dunbartonshire 619
621 S14000019 East Kilbride, Strathaven and Lesmahagow 620
622 S14000020 East Lothian 621
623 S14000021 East Renfrewshire 622
624 S14000022 Edinburgh East 623
625 S14000023 Edinburgh North and Leith 624
626 S14000024 Edinburgh South 625
627 S14000025 Edinburgh South West 626
628 S14000026 Edinburgh West 627
629 S14000027 Na h-Eileanan an Iar 628
630 S14000028 Falkirk 629
631 S14000029 Glasgow Central 630
632 S14000030 Glasgow East 631
633 S14000031 Glasgow North 632
634 S14000032 Glasgow North East 633
635 S14000033 Glasgow North West 634
636 S14000034 Glasgow South 635
637 S14000035 Glasgow South West 636
638 S14000036 Glenrothes 637
639 S14000037 Gordon 638
640 S14000038 Inverclyde 639
641 S14000039 Inverness, Nairn, Badenoch and Strathspey 640
642 S14000040 Kilmarnock and Loudoun 641
643 S14000041 Kirkcaldy and Cowdenbeath 642
644 S14000042 Lanark and Hamilton East 643
645 S14000043 Linlithgow and East Falkirk 644
646 S14000044 Livingston 645
647 S14000045 Midlothian 646
648 S14000046 Moray 647
649 S14000047 Motherwell and Wishaw 648
650 S14000048 North Ayrshire and Arran 649
651 S14000049 North East Fife 650

View file

@ -1 +0,0 @@

View file

@ -1,4 +0,0 @@
requests
python-dotenv
pandas
tqdm

View file

@ -1,26 +1,61 @@
from pathlib import Path
import numpy as np
import pandas as pd
from model_data.BaseUtility import Definitions
from model_data.simulation_system.core.Settings import (
from BaseUtility import Definitions
from etl.epc.settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
FULLY_GLAZED_DESCRIPTIONS,
AVERAGE_FIXED_FEATURES,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP,
COLUMNS_TO_MERGE_ON,
COMPONENT_FEATURES,
FIXED_FEATURES,
COLUMNTYPES,
RDSAP_RESPONSE,
MAX_SAP_SCORE,
fill_na_map,
FIXED_DESCRIPTON_MAPPED_FEATURES
STARTING_SUFFIX_COMPONENT_COLS,
NO_SUFFIX_COMPONENT_COLS,
ENDING_SUFFIX_COMPONENT_COLS
)
from recommendations.rdsap_tables import FLOOR_LEVEL_MAP
from typing import List
# Lookup tables used to clean CONSTRUCTION_AGE_BAND values.

# Standard age-band label -> inclusive lower ("l") and upper ("u") year bounds.
# The row order below is the insertion order of the resulting dict.
bounds_map = {
    label: {"l": lower, "u": upper}
    for label, lower, upper in (
        ("England and Wales: before 1900", 0, 1899),
        ("England and Wales: 1930-1949", 1930, 1949),
        ("England and Wales: 1900-1929", 1900, 1929),
        ("England and Wales: 1950-1966", 1950, 1966),
        ("England and Wales: 1967-1975", 1967, 1975),
        ("England and Wales: 1976-1982", 1976, 1982),
        ("England and Wales: 1983-1990", 1983, 1990),
        ("England and Wales: 1991-1995", 1991, 1995),
        ("England and Wales: 1996-2002", 1996, 2002),
        ("England and Wales: 2003-2006", 2003, 2006),
        ("England and Wales: 2007-2011", 2007, 2011),
        ("England and Wales: 2012 onwards", 2012, 3000),
    )
}

# Non-standard labels that are replaced by a standard label.
remap = {"England and Wales: 2007 onwards": "England and Wales: 2007-2011"}

# Every year 0..3000 -> the first age-band label whose bounds contain that year.
expanded_map = {
    year: next(
        label
        for label, limits in bounds_map.items()
        if limits["l"] <= year <= limits["u"]
    )
    for year in range(0, 3001)
}
def is_int(x):
    """
    Report whether ``int(x)`` succeeds.

    :param x: any value
    :return: True if ``int(x)`` can be evaluated, False if it raises
        TypeError, ValueError or OverflowError
    """
    try:
        int(x)
    # Only the failures int() itself produces are treated as "not an int":
    # TypeError (e.g. None), ValueError (e.g. "abc", NaN), OverflowError (infinity).
    # A bare except would also swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return False
    return True
class DataProcessor:
"""
@ -46,66 +81,36 @@ class DataProcessor:
def insert_data(self, data: pd.DataFrame) -> None:
    """
    Attach an already-loaded DataFrame to this processor.

    :param data: frame stored on ``self.data``; it is assigned as-is, with no copy
        or validation performed here
    """
    self.data = data
@staticmethod
def clean_construction_age_band(x):
    """
    Map one raw CONSTRUCTION_AGE_BAND value onto a standard age-band label.

    Resolution order:
      1. Missing values (None / NaN) and values found in
         Definitions.DATA_ANOMALY_MATCHES are returned unchanged.
      2. Values that are already a key of bounds_map are returned unchanged.
      3. Values with an entry in remap are replaced by the remapped label.
      4. Values that int() accepts are treated as a year and looked up in expanded_map.

    :param x: raw value taken from the CONSTRUCTION_AGE_BAND column
    :return: the cleaned value
    :raises KeyError: if x converts to an integer that is not a key of expanded_map
    :raises NotImplementedError: if none of the cases above applies
    """
    # NaN never compares equal to itself, so a membership test against [None, np.nan]
    # only matches the np.nan singleton by identity. Test for NaN explicitly so that
    # any float NaN is passed through instead of falling to the error at the bottom.
    if x is None or (isinstance(x, (float, np.floating)) and np.isnan(x)):
        return x

    # Known error / anomaly markers are left for later handling
    if x in Definitions.DATA_ANOMALY_MATCHES:
        return x

    # Already a standard label
    if x in bounds_map:
        return x

    # Known non-standard label with a direct replacement
    remapped = remap.get(x)
    if remapped:
        return remapped

    # A bare year (e.g. "1975" or 1975) is resolved to the band containing it
    if is_int(x):
        return expanded_map[int(x)]

    # Wrap x in a tuple so that a tuple-valued x cannot break the % formatting
    raise NotImplementedError("Not handled the case for value %s" % (x,))
def standardise_construction_age_band(self):
"""
This function will tidy up some of the non-standard values that are populated in the construction age
band, which is useful for cleaning
"""
bounds_map = {
"England and Wales: before 1900": {"l": 0, "u": 1899},
"England and Wales: 1930-1949": {"l": 1930, "u": 1949},
"England and Wales: 1900-1929": {"l": 1900, "u": 1929},
"England and Wales: 1950-1966": {"l": 1950, "u": 1966},
"England and Wales: 1967-1975": {"l": 1967, "u": 1975},
"England and Wales: 1976-1982": {"l": 1976, "u": 1982},
"England and Wales: 1983-1990": {"l": 1983, "u": 1990},
"England and Wales: 1991-1995": {"l": 1991, "u": 1995},
"England and Wales: 1996-2002": {"l": 1996, "u": 2002},
"England and Wales: 2003-2006": {"l": 2003, "u": 2006},
"England and Wales: 2007-2011": {"l": 2007, "u": 2011},
"England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
}
remap = {
"England and Wales: 2007 onwards": "England and Wales: 2007-2011"
}
expanded_map = {
i: [
label for label, bounds in bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
][0] for i in range(0, 3001)
}
def is_int(x):
try:
int(x)
return True
except:
return False
def clean_construction_age_band(x):
# Firstly, we check if it's an error value
if x in Definitions.DATA_ANOMALY_MATCHES or x in [None, np.nan]:
return x
# Next, we check if it's a value in our map
if bounds_map.get(x):
return x
# We check if it's a standard remap value
remap_value = remap.get(x, None)
if remap_value:
return remap_value
# We check if it's a number
if is_int(x):
x_int = int(x)
return expanded_map[x_int]
raise NotImplementedError("Not handled the case for value %s" % x)
self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
lambda x: clean_construction_age_band(x)
lambda x: self.clean_construction_age_band(x)
)
self.data = self.data[
@ -157,18 +162,6 @@ class DataProcessor:
break
to_index -= 1
def reformat_columns(self):
"""
This function applies the re-formattng of columns from lower case to capitalised
When requesting the epc data from the api, the columns are lower case
and separated by a hyphen, whereas in the bulk download, the columns
are capitalised and separated by underscores. If rename_columns is True
we convert the columns from lower case to capitalised format
:return:
"""
self.data.columns = [col.upper().replace("-", "_") for col in self.data.columns]
def pre_process(self) -> pd.DataFrame:
"""
Load data and begin initial cleaning
@ -176,22 +169,24 @@ class DataProcessor:
if self.data is None:
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
if self.newdata:
self.reformat_columns()
if not self.newdata:
self.confine_data()
self.remap_columns()
# We have some non-standard construction age bands which we'll clean for matching
self.standardise_construction_age_band()
self.clean_missing_rooms()
if not self.newdata:
self.standardise_construction_age_band()
self.clean_missing_rooms()
self.recast_df_columns(
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
)
self.clean_multi_glaze_proportion()
if not self.newdata:
self.clean_multi_glaze_proportion()
self.clean_photo_supply()
if not self.newdata:
@ -203,16 +198,24 @@ class DataProcessor:
# If we have multiple EPC records, we can try and do filling
self.fill_na_fields()
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
if not self.newdata:
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
# Final re-casting after data transformed and prepared
self.data = self.data.astype(COLUMNTYPES)
coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.newdata else COLUMNTYPES
self.data = self.data.astype(coltypes)
self.na_remapping()
return self.data
def na_remapping(self):
    """
    Fill missing values in ``self.data`` with the per-column defaults from fill_na_map.

    When ``self.newdata`` is truthy, only the fill_na_map columns that are present in
    ``self.data`` are filled. Otherwise every column in fill_na_map is filled, and a
    column absent from ``self.data`` raises KeyError.
    """
    # Build the column -> fill value mapping once, up front; it does not depend on
    # the column being filled, so there is no need to recompute it per entry.
    if self.newdata:
        available = self.data.columns
        fill_values = {
            column: fill_value
            for column, fill_value in fill_na_map.items()
            if column in available
        }
    else:
        fill_values = fill_na_map

    for column, fill_value in fill_values.items():
        self.data[column] = self.data[column].fillna(fill_value)
def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
@ -255,7 +258,8 @@ class DataProcessor:
data = data.replace(np.NAN, None)
# Remap certain columns
data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
if not self.newdata:
data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
convert_to_lower = ["TRANSACTION_TYPE"]
@ -348,7 +352,7 @@ class DataProcessor:
cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")
# If there are still NA values, use the average across all properties in the constituency
# If there are still NA values, use the average across all EPCs in the constituency
cleaning_averages_filled[variable] = cleaning_averages_filled[
variable
].fillna(cleaning_averages_filled[variable].mean())
@ -497,9 +501,15 @@ class DataProcessor:
"""
if suffix not in ["_STARTING", "_ENDING"]:
raise Exception("Suffix should be one of _STARTING or _ENFING")
raise Exception("Suffix should be one of _STARTING or _ENDING")
return self.data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix(suffix)
if suffix == "_STARTING":
starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix)
fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS].copy()
return pd.concat([starting_cols, fixed_cols], axis=1)
return self.data[ENDING_SUFFIX_COMPONENT_COLS].copy().add_suffix(suffix)
def get_fixed_features(self) -> pd.DataFrame:
"""
@ -529,125 +539,33 @@ class DataProcessor:
return df
@classmethod
def difference_data(cls, df: pd.DataFrame):
@staticmethod
def calculate_days_to(lodgement_date):
"""
Given a dataframe and starting and ending columns, this function will convert the features to
differenced the ending subtract the starting value, which is useful for modelling the difference responces
"""
if isinstance(lodgement_date, str):
return (
pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
).days
# We ensure that the u value columns are co-erced to a numerical format
uvalue_columns = [col for col in df.columns if "thermal_transmittance" in col]
for uvalue_col in uvalue_columns:
df[uvalue_col] = pd.to_numeric(df[uvalue_col])
return (
pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
).dt.days
key_columns = [
"RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE",
"SAP_STARTING", "HEAT_DEMAND_STARTING",
"CARBON_STARTING", "UPRN", "CONSTITUENCY",
"SAP_ENDING", "CARBON_ENDING", "HEAT_DEMAND_ENDING",
"DAYS_TO_STARTING", "DAYS_TO_ENDING"
]
@staticmethod
def clean_missings_after_description_process(df, ignore_cols=None):
missings = pd.isnull(df).sum()
missings = missings[missings > 0]
ignore_cols = FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + key_columns
if ignore_cols:
missings = missings[~missings.index.isin(ignore_cols)]
columns = {x for x in df.columns if x not in ignore_cols}
non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
non_numerical_columns = [col for col in non_numerical_columns if col in columns]
levels = {col: df[col].unique().tolist() for col in non_numerical_columns}
df = pd.get_dummies(df, columns=non_numerical_columns)
# We make sure there is a starting and ending version of the column
diff_columns = []
no_diff_columns = [] # Store for debugging
for col in columns:
if "_ENDING" in col:
# Don't keep the endings
continue
for col in missings.index:
unique_values = df[col].unique()
if True in unique_values or False in unique_values:
df[col] = df[col].fillna(False)
if "none" in unique_values:
df[col] = df[col].fillna("none")
else:
# We have a starting column so check if we have an ending
if col.replace("_STARTING", "") + "_ENDING" in columns:
diff_columns.append(col)
else:
no_diff_columns.append(col)
if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
raise Exception("Something went wrong, potentially missed a differencing column")
datatypes = df.dtypes
# Note: We also difference columns like floor area and floor height. We should experiement with this.
# Starting floor area will heavily impact the starting sap value so that feature may be encapsulated by
# the starting value, therefore to explain any differences in the new floor area, it may be enough to
# just consider the difference however we can play around with this.
# Do the differencing
cols_to_append = {}
for starting_col in diff_columns:
base_col = starting_col.replace("_STARTING", "")
if "_STARTING" in starting_col:
ending_col = starting_col.replace("_STARTING", "_ENDING")
else:
ending_col = starting_col + "_ENDING"
if starting_col not in non_numerical_columns:
cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
df = df.drop(columns=[starting_col, ending_col])
continue
level_values = list(set(levels[starting_col] + levels[ending_col]))
level_cols = []
for level in level_values:
starting_level_col = "_".join([starting_col, str(level)])
ending_level_col = "_".join([ending_col, str(level)])
if starting_level_col not in df.columns:
# We have no starting, just ending
col_type = datatypes[ending_level_col].name
if col_type == "bool":
cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col].astype(int)
else:
cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col]
level_cols.append(ending_level_col)
elif ending_level_col not in df.columns:
# We have no ending, just starting
col_type = datatypes[starting_level_col].name
if col_type == "bool":
cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[starting_level_col].astype(int)
else:
cols_to_append[f"{base_col}_{level}_DIFF"] = -1 * df[ending_level_col]
level_cols.append(starting_level_col)
else:
col_type = datatypes[starting_level_col].name
if col_type == "bool":
cols_to_append[f"{base_col}_{level}_DIFF"] = (
df[ending_level_col].astype(int) - df[starting_level_col].astype(int)
)
else:
cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col] - df[starting_level_col]
level_cols.extend([starting_level_col, ending_level_col])
# Drop the columns
df = df.drop(columns=level_cols)
cols_to_append = pd.DataFrame(cols_to_append)
df = pd.concat([df, cols_to_append], axis=1)
# Perform a final coercing of string True/False columns to boolean
df = cls.coerce_boolean_columns(df, cols_to_ignore=key_columns)
df[col] = df[col].fillna("Unknown")
return df

View file

@ -4,25 +4,24 @@ from tqdm import tqdm
import msgpack
from pathlib import Path
from model_data.simulation_system.core.Settings import (
from etl.epc.settings import (
MANDATORY_FIXED_FEATURES,
LATEST_FIELD,
COMPONENT_FEATURES,
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
EARLIEST_EPC_DATE,
CARBON_RESPONSE,
)
from model_data.simulation_system.core.DataProcessor import DataProcessor
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_from_s3_parquet
from etl.epc.DataProcessor import DataProcessor
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
from recommendations.rdsap_tables import england_wales_age_band_lookup
from recommendations.recommendation_utils import (
get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter,
get_wall_type
)
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
def get_cleaned():
@ -364,21 +363,6 @@ def make_uvalues(df):
return df
def clean_missings_after_description_process(df):
    """
    Fill every remaining null in ``df`` in place and return the same frame.

    For each column holding at least one null, the fill value is chosen from the
    column's distinct values:

    * ``False`` when ``True`` or ``False`` is among them,
    * ``"none"`` when the string ``"none"`` is among them,
    * ``"Unknown"`` otherwise.

    NOTE(review): the membership tests compare with ``==``, so a numeric column that
    contains 0 or 1 also takes the ``False`` branch - confirm that is intended.

    :param df: frame to clean; it is modified in place
    :return: the same ``df`` object
    """
    null_counts = pd.isnull(df).sum()
    columns_with_nulls = null_counts[null_counts > 0].index

    for col in columns_with_nulls:
        seen = df[col].unique()
        if True in seen or False in seen:
            # Once the nulls become False nothing is left for a string fill to touch
            fill_value = False
        elif "none" in seen:
            fill_value = "none"
        else:
            fill_value = "Unknown"
        df[col] = df[col].fillna(fill_value)

    return df
def app():
# Get all the files in the directory
@ -400,6 +384,8 @@ def app():
data_processor = DataProcessor(filepath=filepath)
df = data_processor.pre_process()
df[df["WALLS_DESCRIPTION"].str.contains("Cavity")]["WALLS_DESCRIPTION"].unique()
cleaning_averages = data_processor.make_cleaning_averages()
# We have some odd cases with missing constituency so we fill
@ -512,12 +498,11 @@ def app():
# Add some temporal features - we look at the days from the standard starting point in time
# for the starting and ending date so all records are from a fixed point
data_by_urpn_df["DAYS_TO_STARTING"] = (
pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_STARTING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
).dt.days
data_by_urpn_df["DAYS_TO_ENDING"] = (
pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
).dt.days
data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
data_by_urpn_df["LODGEMENT_DATE_STARTING"])
data_by_urpn_df["DAYS_TO_ENDING"] = DataProcessor.calculate_days_to(
data_by_urpn_df["LODGEMENT_DATE_ENDING"])
data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
@ -544,7 +529,7 @@ def app():
# Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
# need to
data_by_urpn_df = clean_missings_after_description_process(data_by_urpn_df)
data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df)
if pd.isnull(data_by_urpn_df).sum().sum():
raise ValueError("Null values found in dataset after process_and_prune_desriptions")
@ -564,6 +549,12 @@ def app():
output = pd.concat(dataset)
# Remove any records that have huge swings in their floor area
output["tfa_diff_abs"] = abs(output["TOTAL_FLOOR_AREA_ENDING"] - output["TOTAL_FLOOR_AREA_STARTING"])
output["tfa_diff_prop"] = output["tfa_diff_abs"] / output["TOTAL_FLOOR_AREA_STARTING"]
output = output[output["tfa_diff_prop"] < 0.5]
output = output.drop(columns=["tfa_diff_abs", "tfa_diff_prop"])
uvalue_columns = [col for col in output.columns if "thermal_transmittance" in col]
for uvalue_col in uvalue_columns:
output[uvalue_col] = pd.to_numeric(output[uvalue_col])
@ -571,15 +562,7 @@ def app():
save_dataframe_to_s3_parquet(
df=output,
bucket_name="retrofit-data-dev",
file_key="sap_change_model/dataset_without_differencing.parquet",
)
output = DataProcessor.difference_data(output)
save_dataframe_to_s3_parquet(
df=output,
bucket_name="retrofit-data-dev",
file_key="sap_change_model/dataset_with_differencing.parquet",
file_key="sap_change_model/dataset.parquet",
)

View file

@ -133,28 +133,6 @@ RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
def ordinal(n):
    """
    Return ``n`` as a string followed by an English ordinal suffix, e.g. 1 -> "1st".

    Numbers whose last two digits fall in 10..20 always take "th"; otherwise the last
    digit picks "st", "nd" or "rd", with "th" for everything else.
    """
    if 10 <= n % 100 <= 20:
        return str(n) + "th"

    suffix_by_last_digit = {1: "st", 2: "nd", 3: "rd"}
    return str(n) + suffix_by_last_digit.get(n % 10, "th")
# Maps the different spellings of a floor level onto an integer floor number.
# Later entries of the literal win on duplicate keys; the only duplicated keys here
# ("10" .. "20", produced by both string families) map to the same value either way.
FLOOR_LEVEL_MAP = {
"Basement": -1,
"Ground": 0,
"ground floor": 0,
"20+": 20,
"21st or above": 21,
# Zero-padded two-digit strings: "00" .. "20"
**{str(i).zfill(2): i for i in range(0, 21)},
# Strings produced by ordinal(): "-1th", "0th", "1st", "2nd", ... "20th"
**{ordinal(i): i for i in range(-1, 21)},
# Plain decimal strings: "-1" .. "20"
**{str(i): i for i in range(-1, 21)},
# Integer keys map to themselves: -1 .. 20
**{i: i for i in range(-1, 21)},
}
BUILT_FORM_REMAP = {
"Enclosed End-Terrace": "End-Terrace",
"Enclosed Mid-Terrace": "Mid-Terrace",
@ -212,10 +190,66 @@ fill_na_map = {
"NUMBER_OPEN_FIREPLACES": 0
}
# After the property descriptions have been re-remapped, we expect these features to be fixed
FIXED_DESCRIPTON_MAPPED_FEATURES = [
'another_property_below', 'is_roof_room', 'is_granite_or_whinstone', 'is_flat', 'is_suspended',
'has_dwelling_above', 'is_as_built', 'is_to_external_air', 'is_cob', 'is_pitched', 'is_solid', 'is_at_rafters',
'is_solid_brick', 'is_loft', 'is_system_built', 'is_timber_frame', 'is_sandstone_or_limestone', 'is_filled_cavity',
'is_cavity_wall', 'is_thatched', 'is_to_unheated_space'
################################################################################################
# These are the features we need for scoring
# We'll likely change how we do this in the future
################################################################################################
# Columns that DataProcessor selects from its data and renames by appending the
# "_STARTING" suffix when component features are requested with that suffix.
STARTING_SUFFIX_COMPONENT_COLS = [
"SAP", "HEAT_DEMAND", "CARBON", "TRANSACTION_TYPE", "MECHANICAL_VENTILATION",
"SECONDHEAT_DESCRIPTION", "ENERGY_TARIFF", "SOLAR_WATER_HEATING_FLAG", "PHOTO_SUPPLY",
"GLAZED_TYPE", "MULTI_GLAZE_PROPORTION", "LOW_ENERGY_LIGHTING", "NUMBER_OPEN_FIREPLACES",
"EXTENSION_COUNT", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "DAYS_TO", "estimated_perimeter"
]
# Columns that DataProcessor copies without adding any suffix and concatenates next to
# the "_STARTING" columns when component features are requested with that suffix.
NO_SUFFIX_COMPONENT_COLS = ['walls_thermal_transmittance', 'is_cavity_wall',
'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone',
'is_park_home', 'walls_insulation_thickness', 'external_insulation', 'internal_insulation',
'floor_thermal_transmittance', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended',
'is_solid', 'another_property_below', 'floor_insulation_thickness',
'roof_thermal_transmittance', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat',
'is_thatched', 'is_at_rafters', 'has_dwelling_above', 'roof_insulation_thickness',
'heater_type', 'system_type', 'thermostat_characteristics', 'heating_scope',
'energy_recovery',
'hotwater_tariff_type', 'extra_features', 'chp_systems', 'distribution_system',
'no_system_present', 'appliance', 'has_radiators', 'has_fan_coil_units',
'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas',
'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k',
'has_electricaire', 'has_assumed_for_most_rooms', 'has_underfloor_heating',
'thermostatic_control', 'charging_system', 'switch_system', 'no_control', 'dhw_control',
'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
'rate_control',
'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
'no_individual_heating_or_community_network', 'complex_fuel_type',
]
# Columns that DataProcessor selects from its data and renames by appending the
# "_ENDING" suffix when component features are requested with that suffix.
ENDING_SUFFIX_COMPONENT_COLS = [
'SAP', 'HEAT_DEMAND', 'CARBON', 'TRANSACTION_TYPE', 'MECHANICAL_VENTILATION', 'SECONDHEAT_DESCRIPTION',
'ENERGY_TARIFF', 'SOLAR_WATER_HEATING_FLAG', 'PHOTO_SUPPLY', 'GLAZED_TYPE', 'MULTI_GLAZE_PROPORTION',
'LOW_ENERGY_LIGHTING', 'NUMBER_OPEN_FIREPLACES', 'EXTENSION_COUNT', 'TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT',
'DAYS_TO', 'walls_thermal_transmittance', 'is_park_home', 'walls_insulation_thickness',
'external_insulation', 'internal_insulation', 'floor_thermal_transmittance', 'floor_insulation_thickness',
'roof_thermal_transmittance', 'roof_insulation_thickness', 'heater_type', 'system_type',
'thermostat_characteristics', 'heating_scope', 'energy_recovery', 'hotwater_tariff_type', 'extra_features',
'chp_systems', 'distribution_system', 'no_system_present', 'appliance', 'has_radiators',
'has_fan_coil_units', 'has_pipes_in_screed_above_insulation', 'has_pipes_in_insulated_timber_floor',
'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters',
'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump',
'has_no_system_present', 'has_portable_electric_heaters', 'has_water_source_heat_pump',
'has_electric_heat_pump', 'has_micro-cogeneration', 'has_solar_assisted_heat_pump',
'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric', 'has_mains_gas', 'has_wood_logs',
'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite', 'has_dual_fuel_mineral_and_wood',
'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire', 'has_assumed_for_most_rooms',
'has_underfloor_heating', 'thermostatic_control', 'charging_system', 'switch_system', 'no_control',
'dhw_control', 'community_heating', 'multiple_room_thermostats', 'auxiliary_systems', 'trvs',
'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter'
]

View file

@ -4,16 +4,16 @@ from collections import defaultdict
import pandas as pd
from model_data.utils import correct_spelling
from model_data.epc_attributes.FloorAttributes import FloorAttributes
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from model_data.epc_attributes.RoofAttributes import RoofAttributes
from model_data.epc_attributes.WallAttributes import WallAttributes
from model_data.epc_attributes.WindowAttributes import WindowAttributes
from model_data.epc_attributes.LightingAttributes import LightingAttributes
from etl.epc_clean.utils import correct_spelling
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
class EpcClean:
@ -130,7 +130,7 @@ class EpcClean:
self.cleaned[field].append(
{
"original_description": description,
"clean_description": cln.description.capitalize(),
"clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
**cln.process()
}
)

View file

@ -3,8 +3,8 @@ import os
import pandas as pd
import msgpack
from model_data.EpcClean import EpcClean
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
from etl.epc_clean.EpcClean import EpcClean
from etl.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
from utils.s3 import save_data_to_s3
@ -19,7 +19,7 @@ LAND_REGISTRY_PATHS = [
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
]
EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
EPC_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
@ -27,7 +27,7 @@ ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
def app():
"""
For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
and produce a dataset of cleaned fields so that when we get new properties, we can quickly
and produce a dataset of cleaned fields so that when we get new epc, we can quickly
sanitise any description data
Currently, this application is just run on a local machine
@ -36,9 +36,6 @@ def app():
cleaned_data = {}
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
for directory in tqdm(epc_directories):
directory_destructured = str(directory).split("/")[-1].split("-")
gss_code = directory_destructured[1]
local_authority = directory_destructured[2]
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
# Rename the columns to the same format as the api returns
@ -62,14 +59,6 @@ def app():
new_data = [x for x in data if x["original_description"] not in existing_descriptions]
cleaned_data[k].extend(new_data)
# TODO: Add property age band into this
# uvalue_estimates = UvalueEstimations(data=data)
# uvalue_estimates.get_estimates(cleaner=cleaner)
# # TODO: Store these to a s3
# uvalue_estimates.walls
# uvalue_estimates.floors
# uvalue_estimates.roofs
# Basic check to make sure all descriptions are unique
for _, cleaned in cleaned_data.items():
descriptions = [x["original_description"] for x in cleaned]

View file

@ -1,7 +1,7 @@
import re
from typing import Dict, Union
from model_data.BaseUtility import Definitions
from model_data.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types
class FloorAttributes(Definitions):

View file

@ -1,6 +1,6 @@
from typing import Dict, Union
from model_data.BaseUtility import Definitions
from model_data.epc_attributes.attribute_utils import clean_description, find_keyword
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, find_keyword
class HotWaterAttributes(Definitions):

View file

@ -1,6 +1,6 @@
import re
from model_data.epc_attributes.attribute_utils import clean_description
from model_data.utils import correct_spelling
from etl.epc_clean.epc_attributes.attribute_utils import clean_description
from etl.epc_clean.utils import correct_spelling
class LightingAttributes:
@ -27,7 +27,7 @@ class LightingAttributes:
lel_match2 = re.search(r"goleuadau ynni-isel mewn (\d+)%? o'r mannau gosod", self.description)
if lel_match is not None or lel_match2 is not None:
# Perform the actual translation
percentage = lel_match.group(1) if lel_match is not None else lel_match2.group(1)
self.description = f"low energy lighting in {percentage}% of fixed outlets"

View file

@ -1,6 +1,6 @@
from typing import Dict, Union
from model_data.BaseUtility import Definitions
from model_data.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword
class MainFuelAttributes(Definitions):

View file

@ -1,5 +1,5 @@
from model_data.BaseUtility import Definitions
from model_data.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
from typing import Dict, Union

View file

@ -1,6 +1,6 @@
from typing import Dict, Union
from model_data.BaseUtility import Definitions
from model_data.epc_attributes.attribute_utils import clean_description, find_keyword
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import clean_description, find_keyword
class MainheatControlAttributes(Definitions):

View file

@ -1,7 +1,7 @@
import re
from typing import Dict, Union
from model_data.BaseUtility import Definitions
from model_data.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
class RoofAttributes(Definitions):

View file

@ -1,7 +1,7 @@
import re
from typing import Dict, Union
from model_data.BaseUtility import Definitions
from model_data.epc_attributes.attribute_utils import (
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import (
extract_component_types,
extract_thermal_transmittance
)

View file

@ -1,6 +1,6 @@
from typing import Dict, Union
from model_data.BaseUtility import Definitions
from model_data.epc_attributes.attribute_utils import clean_description
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import clean_description
class WindowAttributes(Definitions):

View file

@ -0,0 +1,21 @@
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
# Maps each EPC description field name to the attribute class that cleans that field.
# Keys use the hyphenated field naming seen throughout this mapping (e.g. 'walls-description').
all_cleaner_map = {
    'floor-description': FloorAttributes,
    'hotwater-description': HotWaterAttributes,
    'main-fuel': MainFuelAttributes,
    'mainheat-description': MainHeatAttributes,
    'mainheatcont-description': MainheatControlAttributes,
    'roof-description': RoofAttributes,
    'walls-description': WallAttributes,
    'windows-description': WindowAttributes,
    # The key previously carried a stray trailing colon ('lighting-description:'), which made it
    # inconsistent with every other key and unreachable when looking up by the real field name.
    'lighting-description': LightingAttributes,
}

View file

@ -1,5 +1,5 @@
import pytest
import model_data.epc_attributes.attribute_utils as attribute_utils
import etl.epc_clean.epc_attributes.attribute_utils as attribute_utils
def test_extract_thermal_transmittance():

View file

@ -1,6 +1,6 @@
import pytest
import pickle
from model_data.EpcClean import EpcClean
from etl.epc_clean.EpcClean import EpcClean
from pathlib import Path
# For local testing

View file

@ -1,6 +1,6 @@
import pytest
from model_data.tests.test_data.test_floor_attributes_cases import clean_floor_cases
from model_data.epc_attributes.FloorAttributes import FloorAttributes
from etl.epc_clean.tests.test_data.test_floor_attributes_cases import clean_floor_cases
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
class TestCleanFloor:

View file

@ -1,6 +1,6 @@
import pytest
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
from model_data.tests.test_data.test_hot_water_attributes_cases import hotwater_cases
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
from etl.epc_clean.tests.test_data.test_hot_water_attributes_cases import hotwater_cases
class TestHotWaterAttributes:

View file

@ -1,7 +1,7 @@
import pandas as pd
import pytest
from model_data.tests.test_data.test_lighting_attributes_cases import test_cases
from model_data.epc_attributes.LightingAttributes import LightingAttributes
from etl.epc_clean.tests.test_data.test_lighting_attributes_cases import test_cases
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
# An example averages dataset to use in tests. It is a dictionary where the key is a lighting description and the
# value is the expected proportion.

View file

@ -1,6 +1,6 @@
import pytest
from model_data.epc_attributes.MainFuelAttributes import MainFuelAttributes
from model_data.tests.test_data.test_main_fuel_attributes_cases import mainfuel_cases
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
from etl.epc_clean.tests.test_data.test_main_fuel_attributes_cases import mainfuel_cases
class TestMainHeatControlAttributes:

View file

@ -1,6 +1,6 @@
import pytest
from model_data.epc_attributes.MainheatAttributes import MainHeatAttributes
from model_data.tests.test_data.test_mainheat_attributes_cases import mainheat_cases
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
from etl.epc_clean.tests.test_data.test_mainheat_attributes_cases import mainheat_cases
class TestMainHeatAttributes:

View file

@ -1,6 +1,6 @@
import pytest
from model_data.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from model_data.tests.test_data.test_mainheat_control_attributes_cases import mainheat_control_cases
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from etl.epc_clean.tests.test_data.test_mainheat_control_attributes_cases import mainheat_control_cases
class TestMainHeatControlAttributes:

View file

@ -1,7 +1,7 @@
import pytest
from pathlib import Path
from model_data.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases
from model_data.epc_attributes.RoofAttributes import RoofAttributes
from etl.epc_clean.tests.test_data.test_roof_attributes_cases import clean_roof_test_cases
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
# For local testing
if __file__ == "<input>":

View file

@ -1,4 +1,4 @@
from model_data.utils import is_percentage_or_number, correct_spelling
from etl.epc_clean.utils import is_percentage_or_number, correct_spelling
class TestUtils:

View file

@ -1,6 +1,6 @@
import pytest
from model_data.epc_attributes.WallAttributes import WallAttributes
from model_data.tests.test_data.test_wall_attributes_cases import wall_cases
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
from etl.epc_clean.tests.test_data.test_wall_attributes_cases import wall_cases
class TestWallAttributes:

View file

@ -1,6 +1,6 @@
import pytest
from model_data.epc_attributes.WindowAttributes import WindowAttributes
from model_data.tests.test_data.test_window_attributes_cases import windows_cases
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
from etl.epc_clean.tests.test_data.test_window_attributes_cases import windows_cases
class TestWindowAttributes:

View file

@ -1,6 +1,6 @@
import pandas as pd
from unittest.mock import patch, call
from model_data.LandRegistryClient import LandRegistryClient
from etl.land_registry.LandRegistryClient import LandRegistryClient
class TestLandRegistryClient:

View file

@ -0,0 +1,54 @@
"""
This is a simple application which estimates some of the basic dimensions of a property based on EPC
data which we can use as a proxy value if we don't have this information on the EPC
"""
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from etl.epc.settings import EARLIEST_EPC_DATE
from etl.epc.DataProcessor import DataProcessor
from BaseUtility import Definitions
from utils.s3 import save_dataframe_to_s3_parquet
# Directory, relative to this file, whose sub-directories each hold a "certificates.csv" read by app()
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
# Columns used as the grouping keys when aggregating property dimensions in app()
GROUPBY = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY", "CONSTRUCTION_AGE_BAND"]
# S3 bucket the aggregated output is written to; taken from the BUCKET environment variable when set
BUCKET = os.environ.get("BUCKET", "retrofit-data-dev")
def app():
    """
    Aggregate typical property dimensions from EPC certificates and store them in S3.

    For every sub-directory of DATA_DIRECTORY, "certificates.csv" is read, rows are filtered (lodgement
    date, missing UPRN, missing or anomalous construction age band, missing floor area / habitable
    rooms / floor height) and the remainder is grouped by GROUPBY to produce the median number of
    habitable rooms and the mean floor area and floor height. The result is written to
    "property_dimensions/{local_authority}.parquet" in BUCKET.

    Directories that have no rows left after filtering are skipped, because there is no local
    authority to name the output file after.

    :raises Exception: if a single certificates file contains more than one local authority
    """
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
    for directory in tqdm(directories):
        data = pd.read_csv(directory / "certificates.csv", low_memory=False)

        # NOTE(review): this compares the raw LODGEMENT_DATE column with EARLIEST_EPC_DATE as-is;
        # confirm both sides have the same type/format
        data = data[data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
        data = data[~pd.isnull(data["UPRN"])]

        data["TOTAL_FLOOR_AREA"] = data["TOTAL_FLOOR_AREA"].astype(float)
        data["CONSTRUCTION_AGE_BAND"] = data["CONSTRUCTION_AGE_BAND"].apply(
            DataProcessor.clean_construction_age_band
        )

        data = data[~pd.isnull(data["CONSTRUCTION_AGE_BAND"])]
        data = data[~data["CONSTRUCTION_AGE_BAND"].isin(Definitions.DATA_ANOMALY_MATCHES)]
        data = data.dropna(subset=["TOTAL_FLOOR_AREA", "NUMBER_HABITABLE_ROOMS", "FLOOR_HEIGHT"])

        if data.empty:
            # Previously this fell through to `local_authority[0]` and raised an IndexError
            tqdm.write(f"Skipping {directory.name}: no usable certificates after filtering")
            continue

        df = (
            data.groupby(GROUPBY)
            .agg({"NUMBER_HABITABLE_ROOMS": "median", "TOTAL_FLOOR_AREA": "mean", "FLOOR_HEIGHT": "mean"})
            .reset_index()
        )

        # The output file is keyed on the local authority, so each input file must contain exactly one
        local_authority = data["LOCAL_AUTHORITY"].unique()
        if len(local_authority) > 1:
            raise Exception("More than one la in data")
        local_authority = local_authority[0]

        save_dataframe_to_s3_parquet(
            df=df,
            bucket_name=BUCKET,
            file_key=f"property_dimensions/{local_authority}.parquet",
        )

View file

@ -56,7 +56,7 @@ class BoreholeClient:
# EXAMPLE
# There are ~1.4 million entries in this dataset and so we firstly want to reduce the number of
# entries in here if possible before we produce any form of comparison between our properties, to infer
# entries in here if possible before we produce any form of comparison between our epc, to infer
# the distance from the property to the nearest borehole
# Let's take a sample

View file

@ -1,12 +1,55 @@
from enum import Enum
import boto3
import os
import tempfile
import geopandas as gpd
import numpy as np
from enum import Enum
from shapely.geometry import Point
from utils.logger import setup_logger
from utils.s3 import read_io_from_s3
from datatypes.datatypes import OpenUprnCoordinateData
logger = setup_logger()
def read_shapefile_from_s3(bucket_name, s3_file_key):
    """
    Read a shapefile from S3 into a GeoDataFrame.

    The objects stored directly alongside the given key (same S3 "folder") are downloaded into a
    temporary directory so that the companion files of the shapefile are available next to the
    .shp file when it is opened.

    :param bucket_name: The name of the S3 bucket
    :param s3_file_key: The file path of the shape file
    :return: GeoDataFrame containing the shapefile data
    :raises FileNotFoundError: if no objects exist alongside the given key
    """
    s3_folder_key, _, shape_file_key = s3_file_key.rpartition("/")
    # A trailing slash stops "a/b" from also matching "a/b_old/..."; a key at the bucket root has
    # no folder, in which case the prefix is left empty
    prefix = f"{s3_folder_key}/" if s3_folder_key else ""

    # Create a temporary directory (it already exists once the context manager is entered)
    with tempfile.TemporaryDirectory() as tmpdirname:
        s3_client = boto3.client('s3')
        logger.info("Creating temporary directory at %s", tmpdirname)

        # Paginate so folders with more than one page of objects are listed completely, and use a
        # delimiter so only objects directly inside the folder are returned
        paginator = s3_client.get_paginator("list_objects_v2")
        downloaded = 0
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter="/"):
            # "Contents" is absent from the response when nothing matches the prefix
            for s3_object in page.get("Contents", []):
                file_key = s3_object['Key']
                file_name = os.path.basename(file_key)
                if not file_name:
                    # Folder placeholder object (key ends in "/"): nothing to download
                    continue

                local_file_path = os.path.join(tmpdirname, file_name)
                with open(local_file_path, 'wb') as tmpfile:
                    s3_client.download_fileobj(bucket_name, file_key, tmpfile)
                downloaded += 1

        if not downloaded:
            raise FileNotFoundError(f"No objects found in s3://{bucket_name}/{prefix}")

        # Read the shapefile from the temporary directory into a GeoDataFrame before the
        # directory is cleaned up
        shapefile_path = os.path.join(tmpdirname, shape_file_key)
        gdf = gpd.read_file(shapefile_path)

    return gdf
class ConservationAreaClient:
"""
Class to interact and manupulate convervation area data. The historic england data
@ -18,13 +61,14 @@ class ConservationAreaClient:
"""
SOURCES = ["historic_england"]
IN_CONSERVATION_AREA = "in_conservation_area"
NOT_IN_CONSERVATION_AREA = "not_in_conservation_area"
UNKNOWN = "unknown"
IN_CONSERVATION_AREA = True
NOT_IN_CONSERVATION_AREA = False
UNKNOWN = None
def __init__(self, historic_england_path, gov_path):
def __init__(self, historic_england_path, gov_path, bucket):
self.historic_england_path = historic_england_path
self.gov_path = gov_path
self.bucket = bucket
self.historic_england_data = None
self.gov_data = None
@ -34,11 +78,21 @@ class ConservationAreaClient:
Read the data
"""
logger.info("Reading in historic england conservation area shapefile")
self.historic_england_data = gpd.read_file(self.historic_england_path)
self.historic_england_data = read_shapefile_from_s3(
bucket_name=self.bucket, s3_file_key=self.historic_england_path
)
logger.info("Reading in Govenment conservation area geojson")
self.gov_data = gpd.read_file(self.gov_path)
self.gov_data = gpd.read_file(
read_io_from_s3(
bucket_name=self.bucket,
file_key=self.gov_path
)
)
self.gov_data = self.gov_data.drop(columns=["dataset"])
# Convert the gov data to british national grid co-ordinates
self.gov_data = self.gov_data.to_crs("EPSG:27700")
def is_in_conservation_area(self, coordinates: OpenUprnCoordinateData):
@ -71,6 +125,43 @@ class ConservationAreaClient:
else:
return ConservationAreaClient.UNKNOWN
def is_in_conservation_area_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Flag every row of ``uprn_gdf`` with a conservation status using spatial joins.

    The Historic England polygons are checked first. A UPRN inside a polygon whose NAME is
    "No data available for publication by HE" is not treated as a definitive match. Every UPRN
    that Historic England does not place in a conservation area is then checked against the
    government polygons, and a government match always results in IN_CONSERVATION_AREA.

    :param uprn_gdf: GeoDataFrame of points with a "UPRN" column; it is modified in place
    :return: the same GeoDataFrame with a "conservation_status" column holding one of
        IN_CONSERVATION_AREA, NOT_IN_CONSERVATION_AREA or UNKNOWN
    """
    he_join = gpd.sjoin(uprn_gdf, self.historic_england_data, how="left", predicate="within")

    # A missing right index means the point fell inside no Historic England polygon at all
    he_unmatched = he_join.index_right.isna()
    he_definitive = ~he_unmatched & (he_join["NAME"] != "No data available for publication by HE")
    uprn_in_he = he_join[he_definitive]["UPRN"].unique()

    # No polygon matched, so Historic England says the UPRN is outside any conservation area
    uprn_not_in_he = he_join.loc[
        he_unmatched & ~he_join["UPRN"].isin(uprn_in_he),
        "UPRN"
    ].unique()

    # Everything not confirmed by Historic England is re-checked against the government data
    remaining_gdf = uprn_gdf[~uprn_gdf["UPRN"].isin(uprn_in_he)]
    gov_join = gpd.sjoin(remaining_gdf, self.gov_data, how="left", predicate="within")
    uprn_in_gov = gov_join.loc[gov_join.index_right.notna(), "UPRN"].unique()

    # Later assignments win, so a government match overrides a Historic England "not in"
    uprn_gdf['conservation_status'] = self.UNKNOWN
    assignments = (
        (uprn_in_he, self.IN_CONSERVATION_AREA),
        (uprn_not_in_he, self.NOT_IN_CONSERVATION_AREA),
        (uprn_in_gov, self.IN_CONSERVATION_AREA),
    )
    for uprns, status in assignments:
        uprn_gdf.loc[uprn_gdf["UPRN"].isin(uprns), 'conservation_status'] = status

    return uprn_gdf
def is_in_conservation_area_historic_england(self, x_bng: float, y_bng: float) -> str:
"""
Check if a property is in a conservation area

View file

@ -0,0 +1,118 @@
import os
from tqdm import tqdm
import pandas as pd
import geopandas as gpd
from utils.logger import setup_logger
from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet
logger = setup_logger()
class OpenUprnClient:
    """
    This client reads in the Open UPRN data from s3 which can be downloaded from here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    This dataset contains a lookup of UPRNs to coordinates.

    Specs for this dataset can be found here:
    https://www.ordnancesurvey.co.uk/documents/product-support/tech-spec/open-uprn-techspec-v1.pdf
    """

    def __init__(self, path, bucket, uprns=None):
        """
        :param path: S3 key (for read) or local path (for read_local) of the Open UPRN csv
        :param bucket: S3 bucket the csv is read from
        :param uprns: optional iterable of UPRNs to restrict the data to; values are cast to int
        """
        self.path = path
        self.bucket = bucket
        self.uprns = [int(x) for x in uprns] if uprns else None

        self.data = None
        # Mapping of partition id -> parquet filename ("{min_uprn}_{max_uprn}.parquet"), populated by
        # create_file_partitions. The filename encodes which UPRNs a file contains.
        self.filenames = None

    def _filter_uprns(self, df):
        """Restrict df to the UPRNs requested at construction time, if any were given."""
        if self.uprns:
            df = df[df["UPRN"].isin(self.uprns)]
        return df

    def read(self):
        """
        Read the Open UPRN csv from S3 into self.data
        :return:
        """
        logger.info("Reading in open uprn data")
        df = pd.read_csv(
            read_io_from_s3(
                bucket_name=self.bucket,
                file_key=self.path
            )
        )
        self.data = self._filter_uprns(df)

    def read_local(self):
        """
        For local testing: read the Open UPRN csv from the local path into self.data
        :return:
        """
        logger.info("Reading in open uprn data")
        self.data = self._filter_uprns(pd.read_csv(self.path))

    @staticmethod
    def _partition_sorted(data, partition_size):
        """
        Add "partition" and "filename" columns to data that is already sorted by UPRN.

        Partition ids are derived from the row position rather than the index labels, because
        sorting and filtering keep the original labels, which no longer reflect the sorted order.

        :param data: DataFrame with a "UPRN" column, sorted ascending; modified in place
        :param partition_size: maximum number of rows per partition
        :return: tuple of (data, mapping of partition id -> filename)
        :raises ValueError: if partition_size is not positive
        """
        if partition_size <= 0:
            raise ValueError("partition_size must be a positive integer")

        data["partition"] = pd.RangeIndex(len(data)).to_numpy() // partition_size

        bounds = data.groupby("partition")["UPRN"].agg(["min", "max"])
        filenames = {
            partition: f"{min_uprn}_{max_uprn}.parquet"
            for partition, min_uprn, max_uprn in zip(bounds.index, bounds["min"], bounds["max"])
        }

        data["filename"] = data["partition"].map(filenames)
        return data, filenames

    def create_file_partitions(self, partition_size=50000):
        """
        Sort the data by UPRN and split it into consecutive partitions of at most partition_size rows.

        Populates self.filenames and adds "partition" and "filename" columns to self.data.

        :param partition_size: maximum number of rows per partition
        """
        logger.info("Sorting data by UPRN ascending")
        self.data = self.data.sort_values("UPRN", ascending=True)

        logger.info("Creating partitions")
        self.data, self.filenames = self._partition_sorted(self.data, partition_size)

    @staticmethod
    def find_filename_for_uprn(uprn, filenames):
        """
        Find the partition file whose UPRN range contains the given UPRN.

        :param uprn: UPRN to look up
        :param filenames: iterable of filenames of the form "{min_uprn}_{max_uprn}.parquet"
        :return: the matching filename, or None if no range contains the UPRN
        """
        for filename in filenames:
            min_uprn, max_uprn = map(int, filename.replace(".parquet", "").split("_"))
            if min_uprn <= uprn <= max_uprn:
                return filename
        return None

    @staticmethod
    def convert_bng_data_to_gpd(df):
        """
        Build a GeoDataFrame of points from the X_COORDINATE / Y_COORDINATE columns of df.

        :param df: DataFrame with X_COORDINATE and Y_COORDINATE columns
        :return: GeoDataFrame with a point geometry in EPSG:27700
        """
        gpd_data = gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df.X_COORDINATE, df.Y_COORDINATE),
            crs="EPSG:27700"  # British National Grid
        )

        return gpd_data

    def save_filenames_to_s3(self, bucket_name):
        """
        Save the filenames, with the lower and upper UPRN each one covers, to s3
        :param bucket_name: bucket to write "spatial/filename_meta.parquet" to
        :return:
        :raises ValueError: if create_file_partitions has not been called yet
        """
        if self.filenames is None:
            raise ValueError("create_file_partitions must be called before save_filenames_to_s3")

        # S3 keys always use forward slashes, regardless of the local operating system
        file_key = "spatial/filename_meta.parquet"

        filenames = pd.DataFrame({"filenames": list(self.filenames.values())})
        # Raw string keeps \d a regex escape; anchoring makes a malformed filename fail loudly
        # in the int conversion below instead of yielding partial numbers
        bounds = filenames["filenames"].str.extract(r"^(\d+)_(\d+)\.parquet$")
        filenames["lower"] = bounds[0].astype(int)
        filenames["upper"] = bounds[1].astype(int)

        logger.info("Saving filenames to s3 at {}".format(file_key))
        save_dataframe_to_s3_parquet(
            df=filenames,
            file_key=file_key,
            bucket_name=bucket_name
        )

48
etl/spatial/README.md Normal file
View file

@ -0,0 +1,48 @@
# Spatial - Geospatial Data Processing Service
## Overview
The Spatial service is designed to read, process, and analyze geospatial data related to
conservation areas and special buildings. It uses datasets from Historic England and the
UK government to determine whether a given UPRN (Unique Property Reference Number) is within
a conservation area or is a listed building. The processed data is saved back to an S3 bucket
in a parquet format for easy retrieval and further analysis.
## Dependencies
Dependencies are listed in requirements.txt. To install them, run:
```
pip install -r requirements.txt
```
## Data Sources
1. **Historic England Conservation Areas**: Shapefile containing polygons of conservation areas.
2. **UK Government Conservation Areas**: GeoJSON file containing polygons of conservation areas.
3. **Open UPRN Data**: CSV file with UPRN and corresponding geospatial data.
4. **Historic England Listed Buildings**: Shapefile with information on listed buildings.
5. **Historic England Heritage Buildings at Risk**: Shapefile with information on heritage buildings at risk.
## Files
- app.py: Main application file that orchestrates the data processing flow.
- ConservationAreaClient.py: Handles reading and processing of conservation area data.
- OpenUprnClient.py: Manages reading and partitioning of Open UPRN data.
- SpecialBuildingsClient.py: Takes care of reading and processing data related to special buildings.
- requirements.txt: Lists all Python package dependencies.
## How to Run
1. Make sure you have all the required packages installed.
2. Update the S3 bucket and file path constants in app.py.
3. Run app.py.
## Workflow
1. Read the datasets for conservation areas and special buildings.
2. Read the Open UPRN dataset and partition it into smaller chunks based on UPRN.
3. For each partition:
- Convert the UPRN data to a geopandas GeoDataFrame.
- Check if each UPRN is within a conservation area or is a special building.
- Save the processed data back to S3 in parquet format.

View file

@ -0,0 +1,114 @@
import geopandas as gpd
from shapely.geometry import Point
from utils.logger import setup_logger
from etl.spatial.ConservationAreaClient import read_shapefile_from_s3
from datatypes.datatypes import OpenUprnCoordinateData
logger = setup_logger()
class SpecialBuildingsClient:
    """
    Client around two Historic England shapefiles, used to decide whether a location is a listed
    building or a heritage building at risk.
    """

    def __init__(self, historic_england_listed_buildings_path, historic_england_heritage_buildings_path, bucket):
        """
        :param historic_england_listed_buildings_path: S3 key of the listed buildings shapefile
        :param historic_england_heritage_buildings_path: S3 key of the heritage buildings shapefile
        :param bucket: S3 bucket both shapefiles are read from
        """
        self.historic_england_listed_buildings_path = historic_england_listed_buildings_path
        self.historic_england_heritage_buildings_path = historic_england_heritage_buildings_path
        self.bucket = bucket

        # Both datasets stay unset until read() is called
        self.historic_england_listed_buildings = None
        self.historic_england_heritage_buildings = None

    def read(self):
        """
        Load both shapefiles from S3
        """
        logger.info("Reading in historic england listed buildings shapefile")
        self.historic_england_listed_buildings = read_shapefile_from_s3(
            bucket_name=self.bucket,
            s3_file_key=self.historic_england_listed_buildings_path,
        )

        logger.info("Reading in historic england heritage buildings shapefile")
        self.historic_england_heritage_buildings = read_shapefile_from_s3(
            bucket_name=self.bucket,
            s3_file_key=self.historic_england_heritage_buildings_path,
        )

        # Only the heritage buildings data is re-projected to British National Grid coordinates
        self.historic_england_heritage_buildings = self.historic_england_heritage_buildings.to_crs("EPSG:27700")

    @staticmethod
    def _flag_uprns_within(uprn_gdf, polygons, flag_column):
        """
        Set ``flag_column`` on uprn_gdf to True for every UPRN whose point lies within a polygon.

        :param uprn_gdf: GeoDataFrame of points with a "UPRN" column; modified in place
        :param polygons: GeoDataFrame of polygons to test against
        :param flag_column: name of the boolean column to write
        :return: uprn_gdf
        """
        joined = gpd.sjoin(uprn_gdf, polygons, how="left", predicate="within")

        # Rows that found a polygon carry a right-hand index; unmatched rows have it missing
        matched_uprns = joined[joined.index_right.notna()]["UPRN"].unique()

        uprn_gdf[flag_column] = uprn_gdf["UPRN"].isin(matched_uprns)
        return uprn_gdf

    def is_listed_building(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a listed building.

        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE
        :return: True if the location is within a listed building polygon, False otherwise
        """
        location = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
        containing = self.historic_england_listed_buildings.contains(location)

        if not containing.any():
            return False

        # Log which listed buildings the location falls within
        names = self.historic_england_listed_buildings.loc[containing, "Name"]
        logger.info(f"The location is within the following listed buildings: {names.values}")
        return True

    def is_listed_building_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Add a boolean "is_listed_building" column to uprn_gdf using a spatial join.

        :param uprn_gdf: GeoDataFrame of points with a "UPRN" column; modified in place
        :return: uprn_gdf with the new column
        """
        return self._flag_uprns_within(
            uprn_gdf, self.historic_england_listed_buildings, 'is_listed_building'
        )

    def is_heritage_building_at_risk(self, coordinates: OpenUprnCoordinateData) -> bool:
        """
        Check if a location specified by British National Grid coordinates is a heritage building at risk.

        :param coordinates: object exposing X_COORDINATE and Y_COORDINATE
        :return: True if the location is within a heritage building at risk polygon, False otherwise
        """
        location = Point(coordinates.X_COORDINATE, coordinates.Y_COORDINATE)
        containing = self.historic_england_heritage_buildings.contains(location)

        if not containing.any():
            return False

        # Log which heritage buildings at risk the location falls within
        names = self.historic_england_heritage_buildings.loc[containing, "EntryName"]
        logger.info(f"The location is within the following heritage buildings at risk: {names.values}")
        return True

    def is_heritage_building_at_risk_vectorised(self, uprn_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
        """
        Add a boolean "is_heritage_building" column to uprn_gdf using a spatial join.

        :param uprn_gdf: GeoDataFrame of points with a "UPRN" column; modified in place
        :return: uprn_gdf with the new column
        """
        return self._flag_uprns_within(
            uprn_gdf, self.historic_england_heritage_buildings, 'is_heritage_building'
        )

0
etl/spatial/__init__.py Normal file
View file

103
etl/spatial/app.py Normal file
View file

@ -0,0 +1,103 @@
"""
This application reads in the open uprn data from a static location and loads it into
our database for querying from other services
"""
import os
from tqdm import tqdm
import pandas as pd
from etl.spatial.ConservationAreaClient import ConservationAreaClient
from etl.spatial.OpenUprnClient import OpenUprnClient
from etl.spatial.SpecialBuildingsClient import SpecialBuildingsClient
from utils.logger import setup_logger
from utils.s3 import save_dataframe_to_s3_parquet
# Bucket the source datasets are read from; passed as `bucket` to every client created in app()
BUCKET = "retrofit-datalake-dev"
# NOTE(review): presumably the bucket the processed results are written to - confirm against its usage in app()
OUTPUT_BUCKET = "retrofit-data-dev"
# Keys, within BUCKET, of the two conservation area datasets handed to ConservationAreaClient
HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME = "spatial/Historic_England_Conservation_Areas/Conservation_Areas.shp"
GOV_CONSERVARION_AREAS_PATHNAME = "spatial/gov-conservation-area.geojson"
# Key, within BUCKET, of the Open UPRN csv handed to OpenUprnClient
# NOTE(review): the folder is dated 202309 but the file 202308 - confirm this matches the uploaded object
OPEN_UPRN_PATHNAME = "spatial/osopenuprn_202309_csv/osopenuprn_202308.csv"
# Keys, within BUCKET, of the two shapefiles handed to SpecialBuildingsClient
HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME = "spatial/National_Heritage_List_for_England_(" \
"NHLE)/Listed_Building_polygons.shp"
HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME = \
"spatial/Historic_England_Heritage_at_Risk_Register_2022/Historic_England_Heritage_at_Risk_Register_2022.shp"
logger = setup_logger()
def app():
    """
    This application uses the conservation area datasets to determine if a UPRN is
    in a conservation area or not. It also flags UPRNs that fall within listed-building
    and heritage-at-risk polygons.

    We use two sources of data for determining if homes are in conservation areas.
    The first is the Historic England dataset, which is a shapefile containing
    polygons of conservation areas. The second is the gov.uk dataset, which is a
    geojson file containing polygons of conservation areas.

    The Historic England dataset can be found here:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The listed building dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The heritage buildings dataset is also found at Historic England at:
    https://historicengland.org.uk/listing/the-list/data-downloads/#aa33785e

    The Gov.uk dataset can be found here:
    https://www.planning.data.gov.uk/dataset/conservation-area

    The open UPRN data can be found here:
    https://osdatahub.os.uk/downloads/open/OpenUPRN

    The Office for National Statistics Postcode Lookup can be found here:
    https://geoportal.statistics.gov.uk/datasets/9ac0331178b0435e839f62f41cc61c16/about

    For the moment, these data sources are downloaded manually and uploaded to S3.
    This application then processes those files and writes the results to s3:
    one parquet file per UPRN partition, followed by the list of partition filenames.
    """
    conservation_area_client = ConservationAreaClient(
        historic_england_path=HISTORIC_ENGLAND_CONSERVARION_AREAS_PATHNAME,
        gov_path=GOV_CONSERVARION_AREAS_PATHNAME,
        bucket=BUCKET
    )
    conservation_area_client.read()

    special_buildings_client = SpecialBuildingsClient(
        historic_england_listed_buildings_path=HISTORIC_ENGLAND_LISTED_BUILDINGS_PATHNAME,
        historic_england_heritage_buildings_path=HISTORIC_ENGLAND_HERITAGE_BUILDINGS_PATHNAME,
        bucket=BUCKET
    )
    special_buildings_client.read()

    open_uprn_client = OpenUprnClient(
        path=OPEN_UPRN_PATHNAME,
        bucket=BUCKET
    )
    open_uprn_client.read()

    # We want to sort the data and split it into filenames on UPRN.
    # We'll split the data into chunks of 50,000
    open_uprn_client.create_file_partitions()

    logger.info("Extracting spatial data for uprn partitions")
    # Group once and reuse the same object for both the progress-bar total and the iteration
    partitions = open_uprn_client.data.groupby("filename")
    for filename, uprn_df in tqdm(partitions, total=len(partitions)):
        uprn_gdf = OpenUprnClient.convert_bng_data_to_gpd(uprn_df)

        uprn_gdf = conservation_area_client.is_in_conservation_area_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_listed_building_vectorised(uprn_gdf=uprn_gdf)
        uprn_gdf = special_buildings_client.is_heritage_building_at_risk_vectorised(uprn_gdf=uprn_gdf)

        # Convert back to a regular dataframe
        uprn_gdf = uprn_gdf.drop(columns=["geometry"])
        uprn_gdf = pd.DataFrame(uprn_gdf)

        save_dataframe_to_s3_parquet(
            df=uprn_gdf, file_key=os.path.join("spatial", filename), bucket_name=OUTPUT_BUCKET
        )

    # We finally save the filenames to s3
    open_uprn_client.save_filenames_to_s3(bucket_name=OUTPUT_BUCKET)

View file

@ -1,5 +1,5 @@
import pytest
from model_data.BoreholeClient import BoreholeClient
from etl.spatial.BoreholeClient import BoreholeClient
@pytest.fixture

View file

View file

@ -1,5 +1,5 @@
"""
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
This script produces the dataset used to model the wall area of epc, which is used to estimate the cost
of insulation measures within homes
"""
import os

View file

@ -83,7 +83,7 @@ resource "aws_db_instance" "default" {
publicly_accessible = true
}
# Set up the bucket that receives the csv uploads of properties to be retrofit
# Set up the bucket that receives the csv uploads of epc to be retrofit
module "s3_presignable_bucket" {
source = "./modules/s3_presignable_bucket"
bucketname = "retrofit-plan-inputs-${var.stage}"

12
input_property_list.csv Normal file
View file

@ -0,0 +1,12 @@
address,postcode,Notes,,,,
28 Distillery Wharf,W6 9bf,,,,,
Flat 14 Godley V C House,E2 0LP,,,,,
49 Elderfield Road,E5 0LF,,,,,
26 Stanhope Road,N6 5NG,,,,,
Flat 3 Frederick Building,N1 4BD,,,,,
Flat 4 Frederick Building,N1 4BD,,,,,
"Flat 28, 22 Adelina Grove",E1 3BX,,,,,
"Flat 39, 239 Long Lane",SE1 4PT,,,,,
"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,,
"59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,,
88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,,
1 address postcode Notes
2 28 Distillery Wharf W6 9bf
3 Flat 14 Godley V C House E2 0LP
4 49 Elderfield Road E5 0LF
5 26 Stanhope Road N6 5NG
6 Flat 3 Frederick Building N1 4BD
7 Flat 4 Frederick Building N1 4BD
8 Flat 28, 22 Adelina Grove E1 3BX
9 Flat 39, 239 Long Lane SE1 4PT
10 1, Westview, Somerby LE14 2QH This property has an unfilled cavity
11 59, Ashdale CM23 4EB This property has a partially filled cavity
12 88 Cleveland Avenue DL3 7BE This property has a filled cavity

View file

@ -1,49 +0,0 @@
# Environment setup
We're using conda to manage environments to circumvent the
issues with Mac M1. This documentation will also cover Pycharm setup.
We're working in python 3.10 so
```commandline
conda create -n hestia-data python=3.10
```
Then activate the environment
```commandline
conda activate hestia-data
```
To set up with Pycharm, run
```commandline
which python
```
and grab the path to the python executable. Then in Pycharm, go to
Settings > Project > Python Interpreter and click the gear icon
to add a new interpreter. Select Conda and either paste the path to the python executable
and click OK, or select the conda environment from the dropdown.
You may need to restart Pycharm for the new interpreter to be recognised.
To install project dependencies navigate to /model_data and run
```commandline
pip install -r requirements.txt
```
### Running Tests
If you are not in a virtual environment, activate it with
```commandline
conda activate envName
```
Then run
```commandline
pytest --cov-config=model_data/.coveragerc --cov=model_data
```

View file

@ -1,650 +0,0 @@
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from typing import Dict, Optional, List
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
median_absolute_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from model_data.EpcClean import EpcClean
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tqdm import tqdm
from utils.logger import setup_logger
logger = setup_logger()
class SapModel:
    """
    Ordinary-least-squares model of the EPC energy-efficiency score.

    The modelling dataset is built from raw EPC records: wall/floor/roof U-values and the low-energy
    lighting proportion are merged in from an ``EpcClean`` instance, sparse numerical features are
    bucketed, and categorical features are one-hot encoded. ``run`` executes the whole pipeline.
    """

    # We want to estimate for making improvements on different property components
    RESPONSE = "current-energy-efficiency"

    # We could potentially build models by constituency to avoid having too many
    # features in the model
    BASE_FEATURES = [
        "property-type",
        "built-form",
        "construction-age-band",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
        "transaction-type"
    ]

    COMPONENT_FEATURES = [
        "walls-description",
        "floor-description",
        "lighting-description",
        "roof-description",
        "mainheat-description",
        "hotwater-description",
        "main-fuel",
        "mechanical-ventilation",
        "secondheat-description",
        "energy-tariff",
        "solar-water-heating-flag",
        "photo-supply",
        "windows-description",
        "glazed-type",
        "glazed-area",
        "multi-glaze-proportion",
        # "lighting-description" # Might not need to use this
        "low-energy-lighting",
        "number-open-fireplaces",
        "mainheatcont-description",
        "fixed-lighting-outlets-count",
        "floor-height",
        "floor-level",
        "total-floor-area",
        "extension-count",
    ]

    # Columns that are one-hot encoded in fit_model
    CATEGORICAL_COLS = [
        "property-type",
        "built-form",
        "number-habitable-rooms",
        "constituency",
        "number-heated-rooms",
        "mainheat-description",
        "hotwater-description",
        "main-fuel",
        "mechanical-ventilation",
        "secondheat-description",
        "energy-tariff",
        "solar-water-heating-flag",
        "windows-description",
        "glazed-type",
        "glazed-area",
        "construction-age-band",
        "lighting-description",
        "mainheatcont-description",
        "floor-level",
    ]

    NUMERICAL_COLUMNS = [
        "photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces",
        "fixed-lighting-outlets-count",
        "floor-height",
        "total-floor-area",
        "extension-count",
    ]

    # For the moment, we store records of the best performing models as a benchmark for future improvements
    BEST_FIT = {
        'MAPE': 0.04646530042225876, 'Mean Squared Error': 18.635209563729763,
        'Mean Absolute Error': 2.856347408023325, 'R2 Score': 0.800701753826118,
        'Explained Variance Score': 0.800701753826118, 'Median Absolute Error': 1.9026758012120197
    }

    BEST_PREDICT = {
        'MAPE': 0.04346083528432316, 'Mean Squared Error': 21.16036509335514,
        'Mean Absolute Error': 3.0440540802375833, 'R2 Score': 0.7219965012634312,
        'Explained Variance Score': 0.7220620137390414, 'Median Absolute Error': 1.9031967986967828
    }

    BEST_FINAL = {
        'MAPE': 0.04841470773386795, 'Mean Squared Error': 21.323052316630914, 'Mean Absolute Error': 2.988547998636157,
        'R2 Score': 0.7633662459299112, 'Explained Variance Score': 0.7633785339028832,
        'Median Absolute Error': 1.9487883489495985
    }

    # Numerical columns that are binned into categories (see _clean_numericals) and then one-hot encoded
    BUCKET_VARIABLES = [
        "number-open-fireplaces", "fixed-lighting-outlets-count", 'extension-count', 'multi-glaze-proportion'
    ]

    def __init__(
            self, data: List[Dict],
            cleaner: "EpcClean",
            test_size: Optional[float] = 0.2,
            random_state: Optional[int] = None
    ):
        """
        :param data: EPC records as a list of dictionaries, converted to a dataframe
        :param cleaner: EpcClean instance whose ``cleaned`` mapping provides the parsed descriptions
        :param test_size: Fraction of the data held out for testing; None falls back to 0.2
        :param random_state: Seed for the train/test split and random forest; None falls back to 42
        """
        self.df = pd.DataFrame(data)
        self.cleaner = cleaner
        self.random_state = random_state if random_state is not None else 42
        self.test_size = 0.2 if test_size is None else test_size

        self.model_data = None
        self.train_x = None
        self.train_y = None
        self.test_x = None
        self.test_y = None

        self.test_model = None
        self.final_model = None

        self.fit_error = None
        self.predict_error = None
        self.final_error = None

        # Largest-residual rows (and their covariates) for each of the three fits
        self.worst = {
            "fit_errors": pd.DataFrame(),
            "prediction_errors": pd.DataFrame(),
            "fit_x": pd.DataFrame(),
            "prediction_x": pd.DataFrame(),
            "final_errors": pd.DataFrame(),
            "final_x": pd.DataFrame(),
        }

        self.fit_df = None
        self.predict_df = None
        self.final_fit_df = None
        self.diagnosis = {}

    def run(self, plot: bool = False) -> None:
        """
        A pipeline method to run all necessary methods in correct order.

        Any exception is logged and swallowed, so callers must inspect the model attributes
        to find out whether the run succeeded.

        :param plot: Boolean to indicate whether to plot the regression
        """
        try:
            self.create_dataset()
            self.fit_model()
            if plot:
                self.plot_regression(self.fit_df)
        except Exception as e:
            logger.error("An error occurred during execution.")
            logger.error(str(e))

    def _merge_with_u_values(
            self, model_data: pd.DataFrame, description: str, thermal_transmittance: str
    ) -> pd.DataFrame:
        """
        Utility function to merge u value data with model data

        :param model_data: Pandas dataframe which is the main modelling dataset
        :param description: Name of the description column for which we're merging u-values onto
            (without the "-description" suffix, e.g. "walls")
        :param thermal_transmittance: Name of the thermal transmittance column
        :return: model_data with an additional "<description>_u_value" column
        """
        u_values = pd.DataFrame(self.cleaner.cleaned[f"{description}-description"])[
            ["original_description", thermal_transmittance]].rename(
            columns={thermal_transmittance: f"{description}_u_value"}
        )

        model_data = model_data.merge(
            u_values,
            how="left",
            left_on=f"{description}-description",
            right_on="original_description"
        ).drop(columns=["original_description"])

        return model_data

    def _append_cleaned_data(self, model_data: pd.DataFrame) -> pd.DataFrame:
        """
        Appends cleaned data into the model data.

        :param model_data: Original model data.
        :return: Model data with the walls/floor/roof u-values and "low_energy_proportion" appended.
        """
        for description in ["walls", "floor", "roof"]:
            model_data = self._merge_with_u_values(model_data, description, "thermal_transmittance")

        # lighting_proportions added separately as it doesn't use the _merge_with_u_values method
        lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
            ["original_description", "low_energy_proportion"]]

        model_data = model_data.merge(
            lighting_proportions,
            how="left",
            left_on="lighting-description",
            right_on="original_description"
        ).drop(columns=["original_description"])

        return model_data

    @staticmethod
    def _convert_transaction_type(model_data: pd.DataFrame) -> pd.DataFrame:
        """
        Converts transaction type to boolean

        :param model_data: Model data with transaction type.
        :return: Model data with "transaction-type" replaced by a boolean "is_rdsap" column,
            which is False only for "new dwelling" transactions.
        """
        model_data["is_rdsap"] = model_data["transaction-type"] != "new dwelling"
        model_data = model_data.drop(columns=["transaction-type"])
        return model_data

    @staticmethod
    def bucket_and_fill(df: pd.DataFrame, column_name: str, n_bins: int = 10) -> pd.DataFrame:
        """
        Simple utility function to bucket up features into bins and then fill any missing values with "NO_RECORD"

        Non-numerical columns are left untouched (no "<column_name>_bucket" column is created).

        :param df: Dataframe of features to be binned (modified in place)
        :param column_name: Name of the column to be binned
        :param n_bins: Number of bins to use
        :return: Dataframe with binned column "<column_name>_bucket"
        """
        # Check if the column is numerical
        if np.issubdtype(df[column_name].dtype, np.number):
            # Create a new categorical column from numerical one by binning the data
            df[column_name + "_bucket"] = pd.cut(df[column_name], bins=n_bins).astype(str)

            # Replace missing data with "NO_RECORD"; the string cast may turn missing values into "nan"
            df[column_name + "_bucket"] = df[column_name + "_bucket"].fillna("NO_RECORD")
            df[column_name + "_bucket"] = np.where(
                df[column_name + "_bucket"] == "nan",
                "NO_RECORD",
                df[column_name + "_bucket"]
            )

        return df

    def _clean_numericals(self, model_data):
        """
        Bucket the BUCKET_VARIABLES columns and convert the remaining numerical columns to float.

        :param model_data: Model data containing all NUMERICAL_COLUMNS
        :return: Model data where the bucketed columns hold bin labels (or "NO_RECORD") and the
            other numerical columns are floats with empty strings replaced by 0
        """
        # Try binning numericals
        remaining_numericals = [x for x in self.NUMERICAL_COLUMNS if x not in self.BUCKET_VARIABLES]
        for col in self.BUCKET_VARIABLES:
            model_data[col] = pd.to_numeric(model_data[col], errors='coerce')
            # If all values are missing there is nothing to bin: mark every row as "NO_RECORD"
            if all(pd.isnull(model_data[col])):
                model_data[col + "_bucket"] = "NO_RECORD"
                continue
            model_data = self.bucket_and_fill(model_data, col)

        # Replace the data with the binned version
        model_data = model_data.drop(columns=self.BUCKET_VARIABLES)
        model_data = model_data.rename(
            columns=dict(zip([c + "_bucket" for c in self.BUCKET_VARIABLES], self.BUCKET_VARIABLES))
        )

        # Basic fill the rest of the columns with 0 - currently this provided the best performance
        for col in remaining_numericals:
            model_data[col] = np.where(
                model_data[col] == "", "0", model_data[col]
            ).astype(float)

        return model_data

    @staticmethod
    def clean_missings(model_data: pd.DataFrame) -> pd.DataFrame:
        """
        Fills categorical missing data with sensible values

        :param model_data: Original model data (modified in place).
        :return: Model data with cleaned categorical data.
        """
        # Cleaning of energy-tariff and construction-age-band hurt prediction performance, indicating there is
        # potentially a notable difference between a "" missing and a "NO DATA!" missing, worth differentiating
        model_data["mechanical-ventilation"] = np.where(
            model_data["mechanical-ventilation"] == "", "NO DATA!", model_data["mechanical-ventilation"]
        )

        model_data["solar-water-heating-flag"] = np.where(
            model_data["solar-water-heating-flag"] == "", "N", model_data["solar-water-heating-flag"]
        )

        model_data["glazed-type"] = np.where(
            model_data["glazed-type"] == "", "NO DATA!", model_data["glazed-type"]
        )

        # Keep the existing glazed-area values; previously these were overwritten with glazed-type
        model_data["glazed-area"] = np.where(
            model_data["glazed-area"] == "", "NO DATA!", model_data["glazed-area"]
        )

        return model_data

    def create_dataset(self):
        """
        Build the modelling dataset from the raw records and store it on ``self.model_data``.

        Only records with a wall, floor and roof u-value are kept.
        """
        logger.info("Creating modelling dataset")

        model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
        model_data = model_data.reset_index(drop=True)
        # Keep a stable row identifier so fitted values can be traced back to the input records
        model_data["idx"] = model_data.index.copy()

        # Append on u-values
        model_data = self._append_cleaned_data(model_data)
        model_data = self.clean_missings(model_data)

        # Convert transaction_type
        model_data = self._convert_transaction_type(model_data)

        # Clean numerical columns
        model_data = self._clean_numericals(model_data)

        # Take just entries with U-values
        # TODO: Rather than doing this, do we want to include the estimated u-values?
        #  Since this ends up with just 2k entries
        model_data = model_data[
            ~pd.isnull(model_data["walls_u_value"]) &
            ~pd.isnull(model_data["floor_u_value"]) &
            ~pd.isnull(model_data["roof_u_value"])
        ]

        exclude_features = [
            "walls-description", "floor-description", "roof-description", "transaction-type"
        ]

        features = [
            x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [
                "walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE, "idx", "is_rdsap"
            ] if x not in exclude_features
        ]

        # Copy so the column assignments below do not write into a slice of the filtered frame
        model_data = model_data[features].copy()

        for col in self.CATEGORICAL_COLS:
            model_data[col] = model_data[col].astype('category')

        # Convert response
        model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float)

        self.model_data = model_data

    def make_training_test(self, x):
        """
        Split the encoded dataset into training and test sets, stored on the instance.

        :param x: Encoded dataset containing the response column
        """
        # Split into training and test
        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
            x.drop(self.RESPONSE, axis=1),
            x[self.RESPONSE],
            test_size=self.test_size,
            random_state=self.random_state
        )

    @staticmethod
    def remove_zero_std_cols(train_x, test_x=None, threshold=1e-3):
        """
        Utility function to remove columns that have zero standard deviation from both test and train sets

        :param train_x: Training data to remove columns from
        :param test_x: If provided, remove the same columns from the test data
        :param threshold: float value, if the standard deviation is below this threshold, the column is considered
            to have zero standard deviation
        :return: Tuple of train_x and test_x (if provided). If test_x is not provided, a null placeholder is returned
        """
        # Compute standard deviations
        std_devs = train_x.std()

        # Find columns with zero or near-zero standard deviation
        zero_std_cols = std_devs[std_devs <= threshold].index

        # Drop these columns from the training data
        train_x = train_x.drop(zero_std_cols, axis=1)

        if test_x is not None:
            # Ensure the test data has the same columns
            test_x = test_x[train_x.columns]
            return train_x, test_x

        return train_x, None

    def fit_model(self):
        """
        Main function to fit the model and produce accuracy metrics

        A "testing" model is fitted on the training split and scored on both splits against the
        stored benchmarks; a "final" model is then fitted on all rows using the same columns.
        """
        x = pd.get_dummies(self.model_data, columns=self.CATEGORICAL_COLS + self.BUCKET_VARIABLES, drop_first=True)

        # Convert booleans to integer
        for col in x.columns:
            if x[col].dtype == bool:
                x[col] = x[col].astype(int)
            if x[col].dtype == object:
                x[col] = x[col].astype(float)

        # Create the training and test sets for each run
        self.make_training_test(x)

        self.train_x, self.test_x = self.remove_zero_std_cols(self.train_x, self.test_x)

        logger.info("Detecting multi-collinearity in training dataset")
        self.detect_multi_collinearity()

        # Add a constant to the independent value. has_constant="add" forces the intercept column
        # even if a covariate happens to be constant within a split, so the train and test design
        # matrices always have identical columns.
        train_x = sm.add_constant(self.train_x, has_constant="add")
        test_x = sm.add_constant(self.test_x, has_constant="add")

        train_idx = train_x["idx"].copy()
        test_idx = test_x["idx"].copy()

        train_x = train_x.drop(columns=["idx"])
        test_x = test_x.drop(columns=["idx"])

        logger.info("Fitting testing model")
        # make regression model
        model = sm.OLS(self.train_y, train_x)

        # fit model and print results
        self.test_model = model.fit()

        train_predictions = self.test_model.fittedvalues
        test_predictions = self.test_model.predict(test_x)

        self.fit_error, self.worst["fit_errors"] = self.calculate_regression_metrics(
            y_true=self.train_y, y_pred=train_predictions
        )

        # Predict on new data
        self.predict_error, self.worst["prediction_errors"] = self.calculate_regression_metrics(
            y_true=self.test_y, y_pred=test_predictions
        )

        fit_success = self.check_successes(self.fit_error, self.BEST_FIT)
        predict_success = self.check_successes(self.predict_error, self.BEST_PREDICT)

        # Aligned on index: only training rows receive a fitted value
        self.model_data['fit'] = self.test_model.fittedvalues

        # The worst errors over index heavily for flats
        self.worst["fit_x"] = self.model_data[self.model_data.index.isin(self.worst["fit_errors"].index)]
        self.worst["prediction_x"] = self.model_data[self.model_data.index.isin(self.worst["prediction_errors"].index)]

        self.fit_df = pd.DataFrame(
            {
                "fit": train_predictions,
                "actual": self.train_y,
                "idx": train_idx
            }
        ).sort_values("actual", ascending=True)

        self.predict_df = pd.DataFrame(
            {
                "predictions": test_predictions,
                "actual": self.test_y,
                "idx": test_idx
            }
        )

        self.diagnosis = {
            "fit_success": fit_success,
            "predict_success": predict_success,
            "summary": self.test_model.summary()
        }

        # We're now ready to fit the final model
        # For the moment, the pre-processing at the top of this function merely removes columns, so we
        # just need to remove the columns that were removed from the training data from the final model
        logger.info("Fitting final model")

        y = x[self.RESPONSE]
        final_x = x[self.train_x.columns]
        idx = final_x["idx"].copy()
        # The constant is added AFTER selecting the training columns: self.train_x has no "const"
        # column, so adding it before the selection silently dropped the intercept again.
        final_x = sm.add_constant(final_x.drop(columns=["idx"]), has_constant="add")

        final_model = sm.OLS(y, final_x)
        # fit model and print results
        self.final_model = final_model.fit()

        final_predictions = self.final_model.fittedvalues
        self.final_error, self.worst["final_errors"] = self.calculate_regression_metrics(
            y_true=y, y_pred=final_predictions
        )

        self.final_fit_df = pd.DataFrame(
            {
                "fit": final_predictions,
                "actual": y,
                "idx": idx
            }
        ).sort_values("actual", ascending=True)

    @staticmethod
    def check_successes(experiment_error, best_error):
        """
        Simple function to check if the experiment error is better than the best error

        :param experiment_error: output of calculate_regression_metrics() on the experiment
        :param best_error: Current benchmark best error
        :return: Dataframe with one row per metric and the columns "measure", "success", "difference"
        """
        successes = []
        for k in experiment_error:
            if k in ["Explained Variance Score", "R2 Score"]:
                # We want to maximise this so we want experiment error to be higher
                successes.append(
                    {
                        "measure": k,
                        "success": experiment_error[k] >= best_error[k],
                        "difference": abs(experiment_error[k] - best_error[k])
                    }
                )
                continue

            # All other metrics are errors, where lower is better
            successes.append(
                {
                    "measure": k,
                    "success": experiment_error[k] <= best_error[k],
                    "difference": abs(experiment_error[k] - best_error[k])
                }
            )

        return pd.DataFrame(successes)

    def rf_importance(self, train_x, train_y, test_x, test_y):
        """
        Utility function to estimate feature importance using a random forest
        This is useful to get a sense of some of the key features which are driving model
        performance

        :param train_x: Training data covariates to build the importance model on
        :param train_y: Training data response to build the importance model on
        :param test_x: Test data covariates to build the permutation importance model on
        :param test_y: Test data response to build the permutation importance model on
        :return: Tuple of two dataframes (random forest importances, permutation importances),
            each ranked by most important to least
        """
        rf = RandomForestRegressor(random_state=self.random_state)
        rf.fit(train_x, train_y)

        # Collect the name and importance of each feature
        rf_importance_df = []
        for feature, importance in zip(train_x.columns, rf.feature_importances_):
            rf_importance_df.append(
                {
                    "Feature": feature,
                    "rf_importance": importance
                }
            )

        rf_importance_df = pd.DataFrame(rf_importance_df)
        rf_importance_df = rf_importance_df.sort_values(by="rf_importance", ascending=False)

        perm_importance = self.permuation_importance(rf, test_x, test_y)

        return rf_importance_df, perm_importance

    @staticmethod
    def permuation_importance(rf, test_x, test_y):
        """
        Simple utility function to produce permutation importance for a given model

        :param rf: Random forest model to calculate permutation importance for
        :param test_x: Test covariates to be used for permutation importance
        :param test_y: Test response to be used for permutation importance
        :return: Dataframe of mean permutation importances, ranked by most important to least
        """
        perm_importance = permutation_importance(rf, test_x, test_y, scoring='neg_mean_squared_error')

        perm_importance_df = pd.DataFrame(
            {
                "Feature": test_x.columns,
                "perm_importance": perm_importance.importances_mean
            }
        ).sort_values(by="perm_importance", ascending=False)

        return perm_importance_df

    def detect_multi_collinearity(self):
        """
        Drop every non-required training column whose variance inflation factor is infinite,
        and apply the same column selection to the test set.
        """
        # Get the VIFs for each variable
        vifs = pd.DataFrame()
        vifs["features"] = self.train_x.columns
        vifs["vif"] = [variance_inflation_factor(self.train_x.values, i) for i in tqdm(range(self.train_x.shape[1]))]

        # Get the features with the highest VIF
        vifs = vifs.sort_values("vif", ascending=False)

        # There are some features, we do not want to remove
        required_features = [
            "walls_u_value", "floor_u_value", "roof_u_value", "idx", "is_rdsap"
        ]

        vifs = vifs[~vifs["features"].isin(required_features)]

        drop_vifs = vifs[np.isinf(vifs["vif"])]

        # Acceptable drop variables:
        # main-fuel_Gas: mains gas
        # glazed-type_NO DATA!
        # glazed-area_NO DATA!
        self.train_x = self.train_x.drop(columns=drop_vifs["features"].values)
        self.test_x = self.test_x[self.train_x.columns]

    @staticmethod
    def plot_regression(df):
        """
        Plot fitted against actual values.

        :param df: Dataframe with "fit" and "actual" columns
        """
        # Extract the "fit" and "actual" columns from the dataframe
        fit = df['fit']
        actual = df['actual']

        # Create an array of x-values (assumed to be sequential integers)
        x = np.arange(len(df))

        # Plot the fit and actual data
        plt.plot(x, fit, color='red', label='Fit')
        plt.plot(x, actual, color='blue', label='Actual')

        # Set labels and title
        plt.xlabel('Index')
        plt.ylabel('Value')
        plt.title('Linear Regression - Fit vs Actual')

        # Display legend
        plt.legend()

        # Show the plot
        plt.show()

    @staticmethod
    def calculate_regression_metrics(y_true, y_pred, n=20):
        """
        Calculate six accuracy metrics for regression and extract the worst predictions.

        :param y_true: Array of true target values.
        :param y_pred: Array of predicted target values.
        :param n: Number of largest absolute residuals to return
        :return: Tuple of (dictionary of metrics, dataframe of the n rows with the largest
            absolute residual). In the dataframe "Actual" holds y_true, "Fit" holds y_pred and
            "Residual" is Actual - Fit.
        """
        metrics = {
            'MAPE': mean_absolute_percentage_error(y_true, y_pred),
            'Mean Squared Error': mean_squared_error(y_true, y_pred),
            'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
            'R2 Score': r2_score(y_true, y_pred),
            'Explained Variance Score': explained_variance_score(y_true, y_pred),
            'Median Absolute Error': median_absolute_error(y_true, y_pred)
        }

        errors = pd.DataFrame()
        # y_true is assigned first so the frame keeps its index, as before. The column labels
        # were previously swapped (true values under "Fit", predictions under "Actual").
        errors['Actual'] = y_true
        errors['Fit'] = y_pred
        errors['Residual'] = errors['Actual'] - errors['Fit']
        errors['Absolute Residual'] = np.abs(errors['Residual'])

        worst_errors = errors.nlargest(n, 'Absolute Residual')

        return metrics, worst_errors

View file

@ -1,207 +0,0 @@
import pickle
import pandas as pd
import numpy as np
from model_data.EpcClean import EpcClean
class UvalueEstimations:
    """
    Median U-value lookup tables for walls and floors.

    Records whose description reports a measured "Average thermal transmittance" are grouped by
    property characteristics (including a floor-area decile) and summarised by the median
    thermal transmittance and the number of samples in each group.
    """

    def __init__(self, data: list):
        """
        Initialize the UvalueEstimations class.

        :param data: The input data as a list of dictionaries, to be converted to a dataframe
        """
        self.data = pd.DataFrame(data)

        self.walls = None
        self.walls_decile_data = {}
        self.roofs = None
        self.floors = None
        self.floors_decile_data = {}

    def get_estimates(self, cleaner: "EpcClean"):
        """
        Calculate U-value estimates for walls, roofs, and floors.

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.set_walls(cleaner)
        self.set_roofs(cleaner)
        self.set_floors(cleaner)

    def _summarise_u_values(self, cleaner: "EpcClean", component: str):
        """
        Build the median U-value summary for one building component.

        :param cleaner: An instance of the EpcClean class; ``cleaner.cleaned["<component>-description"]``
            must provide "original_description" and "thermal_transmittance"
        :param component: Column prefix of the component, e.g. "walls" or "floor"
        :return: Tuple of (summary dataframe, dict with "decile_labels" and "decile_boundaries")
        """
        description_col = f"{component}-description"
        columns = [
            "local-authority", "property-type", description_col, f"{component}-energy-eff",
            f"{component}-env-eff", "built-form",
            "total-floor-area", "number-habitable-rooms", "number-heated-rooms"
        ]
        group_columns = [
            "local-authority",
            "property-type",
            f"{component}-energy-eff",
            f"{component}-env-eff",
            "built-form",
            "number-habitable-rooms",
            "number-heated-rooms",
            "total-floor-area_group"
        ]

        # na=False: a record with a missing description simply has no measured U-value
        has_u_value = self.data[description_col].str.contains("Average thermal transmittance", na=False)

        # Take just the columns we want; copy so the assignments below do not write into a slice
        component_df = self.data.loc[has_u_value, columns].copy()
        component_df["total-floor-area"] = component_df["total-floor-area"].astype(float)

        component_df, decile_labels, decile_boundaries = self.classify_into_deciles(
            component_df, "total-floor-area"
        )

        # We now get the U-values
        component_df = component_df.merge(
            pd.DataFrame(cleaner.cleaned[description_col])[["original_description", "thermal_transmittance"]],
            how="left",
            right_on="original_description",
            left_on=description_col
        )

        u_value_summary = component_df.groupby(
            group_columns,
            observed=True
        ).agg({"thermal_transmittance": ["median", "size"]}).reset_index()

        # Flatten the two-level column index produced by the aggregation
        u_value_summary.columns = group_columns + ["median_thermal_transmittance", "n_samples"]

        decile_data = {
            "decile_labels": decile_labels,
            "decile_boundaries": decile_boundaries
        }
        return u_value_summary, decile_data

    def set_walls(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for walls.

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.walls, self.walls_decile_data = self._summarise_u_values(cleaner, "walls")

    def set_roofs(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for roofs. Not implemented yet: ``self.roofs`` is left untouched.

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        pass

    def set_floors(self, cleaner: "EpcClean"):
        """
        Set U-value estimates for floors.

        :param cleaner: An instance of the EpcClean class used for cleaning data.
        """
        self.floors, self.floors_decile_data = self._summarise_u_values(cleaner, "floor")

    @staticmethod
    def classify_into_deciles(df: pd.DataFrame, column: str) -> tuple[pd.DataFrame, list, np.ndarray]:
        """
        Break a column in a Pandas DataFrame into deciles.

        The decile group is written to a new "<column>_group" column on ``df`` itself.

        :param df: The input Pandas DataFrame (modified in place).
        :param column: The column name to break into deciles.
        :return: A tuple containing:
                 - The DataFrame with the decile group column.
                 - The list of decile labels.
                 - The array of decile boundaries.
        """
        # Calculate decile boundaries (11 edges: the 0th, 10th, ..., 100th percentile)
        decile_boundaries = np.percentile(df[column], np.arange(0, 101, 10))

        # Create decile labels
        decile_labels = [f"Decile {i + 1}" for i in range(10)]

        # Assign decile labels to existing values; include_lowest keeps the minimum in "Decile 1"
        df[column + "_group"] = pd.cut(df[column], bins=decile_boundaries, labels=decile_labels,
                                       include_lowest=True)

        return df, decile_labels, decile_boundaries

    @staticmethod
    def classify_decile_newvalues(decile_boundaries, decile_labels, new_values: list) -> list:
        """
        Classify new values into existing deciles based on decile definitions.

        :param decile_boundaries: The list of decile boundaries.
        :param decile_labels: The list of decile labels.
        :param new_values: A list of new values to classify.
        :return: The classifications for the new values as a list.
        """
        # Classify new values based on decile definitions
        classifications = pd.cut(new_values, bins=decile_boundaries, labels=decile_labels, include_lowest=True)

        return classifications.tolist()

    def _save(self, filename):
        """
        Useful utility function to store this object, which is particularly handy for unit testing

        :param filename: Path of the pickle file to write
        :return:
        """
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

View file

@ -1,6 +0,0 @@
import os
from dotenv import load_dotenv
# Load variables from the env file at this path into the process environment.
# NOTE(review): the path is relative, so it presumably resolves against the current working
# directory - confirm the process is always started from the repository root.
load_dotenv(dotenv_path='model_data/.env')
# Auth token read from the EPC_AUTH_TOKEN environment variable; None when it is not set.
EPC_AUTH_TOKEN = os.environ.get('EPC_AUTH_TOKEN')

View file

@ -1,29 +0,0 @@
import time
def pagenated_epc_download(client, params, page_size, n_pages, verbose=0, slowdown=0.1):
    """
    Download up to ``n_pages`` pages of domestic EPC search results.

    :param client: Client exposing ``client.domestic.search(params=..., offset_from=..., size=...)``
    :param params: Search parameters forwarded unchanged to every request
    :param page_size: Number of rows requested per page; also the offset increment between pages
    :param n_pages: Number of pages after which the download stops
    :param verbose: If truthy, print the page number before each request
    :param slowdown: Seconds to sleep before each request
    :return: List of all rows collected; the download also stops at the first falsy response
    """
    rows = []
    pages_fetched = 0
    offset = 0

    while True:
        if verbose:
            print("Pulling for page %s" % str(int(offset / page_size) + 1))

        time.sleep(slowdown)
        response = client.domestic.search(params=params, offset_from=offset, size=page_size)

        # Note: We can only make 10k queries for a single set of search queries.
        # It might make sense to download data via zip for machine learning since we don't need this
        # data to be perfectly up to date
        if not response:
            return rows

        rows.extend(response["rows"])
        pages_fetched += 1

        if pages_fetched == n_pages:
            return rows

        offset += page_size

View file

@ -1,40 +0,0 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def create_heatmap_plots(data, response_var, pivot_var1, pivot_var2, order1=None, order2=None):
    """
    Draw a heatmap of a response variable pivoted by two other variables.

    :param data: List of dictionaries, input data.
    :param response_var: String, name of the variable whose pivoted values are shown in the cells.
    :param pivot_var1: String, variable used for the heatmap rows.
    :param pivot_var2: String, variable used for the heatmap columns.
    :param order1: List, row order (categories of pivot_var1). Optional.
    :param order2: List, column order (categories of pivot_var2). Optional.

    Returns:
    None. Displays the generated plot.
    """
    # Build the frame and make sure the response is numeric
    frame = pd.DataFrame(data)
    frame[response_var] = frame[response_var].astype(float)

    # Rows are pivot_var1, columns are pivot_var2, cells use pivot_table's default aggregation
    heat = pd.pivot_table(frame, index=pivot_var1, columns=pivot_var2, values=response_var)

    # Apply the requested row order first, then the column order
    if order1 is not None:
        heat = heat.reindex(order1)
    if order2 is not None:
        heat = heat[order2]

    plt.figure(figsize=(10, 6))
    sns.heatmap(heat, annot=True, fmt=".2f", cmap='coolwarm')
    plt.title(f"Heatmap of {response_var} by {pivot_var1} and {pivot_var2}")
    plt.show()

Some files were not shown because too many files have changed in this diff Show more