mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
commit
8b70a71241
22 changed files with 719 additions and 214 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyNamespacePackagesService">
|
||||
|
|
|
|||
5
.idea/misc.xml
generated
5
.idea/misc.xml
generated
|
|
@ -3,7 +3,10 @@
|
|||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
BIN
etl/customers/stonewater/map_app/assets/hestia-logo.png
Normal file
BIN
etl/customers/stonewater/map_app/assets/hestia-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 18 KiB |
1
etl/customers/stonewater/map_app/assets/osmosis-Logo.svg
Normal file
1
etl/customers/stonewater/map_app/assets/osmosis-Logo.svg
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 7.2 KiB |
BIN
etl/customers/stonewater/map_app/assets/stonewater-logo.png
Normal file
BIN
etl/customers/stonewater/map_app/assets/stonewater-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 19 KiB |
0
etl/customers/stonewater/map_app/callbacks.py
Normal file
0
etl/customers/stonewater/map_app/callbacks.py
Normal file
8
etl/customers/stonewater/map_app/config.py
Normal file
8
etl/customers/stonewater/map_app/config.py
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
import os
import json
import dotenv

# When running locally, we'll need to load the .env file
dotenv.load_dotenv()

# Mapbox access token read from the environment; this is None when the
# MAPBOX_ACCESS_TOKEN variable is not set.
MAPBOX_ACCESS_TOKEN = os.getenv("MAPBOX_ACCESS_TOKEN")
|
||||
230
etl/customers/stonewater/map_app/map_page.py
Normal file
230
etl/customers/stonewater/map_app/map_page.py
Normal file
|
|
@ -0,0 +1,230 @@
|
|||
import dash_bootstrap_components as dbc
|
||||
from dash import html, dcc
|
||||
import json
|
||||
import plotly.graph_objects as go
|
||||
import pandas as pd
|
||||
|
||||
from config import MAPBOX_ACCESS_TOKEN
|
||||
|
||||
|
||||
def make_real_epc_piechart(real_epc_breakdown):
    """Build the pie chart comparing real and estimated EPC counts.

    Args:
        real_epc_breakdown: List of records, each providing an
            ``"is_real_epc"`` entry (used as the sector label) and a
            ``"count"`` entry (used as the sector value).

    Returns:
        A ``dcc.Graph`` wrapping the pie figure, with the mode bar hidden.
    """
    sector_labels = []
    sector_values = []
    for record in real_epc_breakdown:
        sector_labels.append(record["is_real_epc"])
        sector_values.append(record["count"])

    pie = go.Pie(
        labels=sector_labels,
        values=sector_values,
        marker_colors=["#027fa6", "rgb(225 225 225)"],
    )
    figure = go.Figure(data=[pie])

    # Drop the top margin so the chart sits directly under its caption
    figure.update_layout(margin={"t": 0})

    return dcc.Graph(figure=figure, config={"displayModeBar": False})
|
||||
|
||||
|
||||
def make_epc_rating_piechart(epc_rating_breakdown):
    """Build the pie chart of EPC rating counts, ordered from A to G.

    Args:
        epc_rating_breakdown: List of records, each providing an ``"EPC"``
            entry (the rating, used as the sector label) and a ``"count"``
            entry (used as the sector value).

    Returns:
        A ``dcc.Graph`` wrapping the pie figure, with the mode bar hidden.
    """
    # Colours are keyed by rating rather than by position, so that a rating
    # band that is absent from the data cannot shift the colours of the
    # bands that follow it.
    rating_colors = {
        "A": "#117d58",
        "B": "#2da55c",
        "C": "#8dbd40",
        "D": "#f7cd14",
        "E": "#f3a96a",
        "F": "#ef8026",
        "G": "#e41e3b",
    }
    # Neutral grey for any label that is not one of the A-G bands
    fallback_color = "#808080"

    # Order the records alphabetically by rating, i.e. from A to G
    epc_rating_breakdown = sorted(epc_rating_breakdown, key=lambda x: x["EPC"])

    labels = [x["EPC"] for x in epc_rating_breakdown]
    values = [x["count"] for x in epc_rating_breakdown]
    marker_colors = [rating_colors.get(label, fallback_color) for label in labels]

    fig = go.Figure(
        # sort=False keeps the sectors in the order supplied above instead
        # of letting plotly reorder them by size
        data=[go.Pie(labels=labels, values=values, marker_colors=marker_colors, sort=False)],
    )

    fig.update_layout(margin={"t": 0})

    plot = dcc.Graph(figure=fig, config={"displayModeBar": False})

    return plot
|
||||
|
||||
|
||||
def make_map(locations):
    """Build the Mapbox scatter map of the supplied properties.

    Args:
        locations: List of records, each providing ``"uprn"``,
            ``"standardised_address"``, ``"standardised_postcode"``,
            ``"LATITUDE"`` and ``"LONGITUDE"`` entries.

    Returns:
        A ``dcc.Graph`` wrapping the map figure, or ``None`` when
        ``locations`` is empty.
    """
    if not locations:
        return None

    df = pd.DataFrame(locations)

    def _describe(row):
        # Text shown when hovering over a marker; <br> separates the lines
        return (
            f"UPRN: {int(row['uprn'])}<br>Address: {row['standardised_address']}<br>Postcode: "
            f"{row['standardised_postcode']}<br>Latitude: {row['LATITUDE']}<br>Longitude: {row['LONGITUDE']}"
        )

    df['hover_text'] = df.apply(_describe, axis=1)

    markers = go.Scattermapbox(
        lat=df["LATITUDE"].tolist(),
        lon=df["LONGITUDE"].tolist(),
        mode="markers",
        marker=go.scattermapbox.Marker(size=10, color="#027fa6"),
        text=df["hover_text"],  # Use the custom hover text
        hoverinfo='text',
    )

    # Initial camera: fixed centre and zoom, looking straight down
    mapbox = go.layout.Mapbox(
        accesstoken=MAPBOX_ACCESS_TOKEN,
        bearing=0,
        center=go.layout.mapbox.Center(lat=53, lon=-1.5),
        pitch=0,
        zoom=5,
    )

    figure = go.Figure(
        data=[markers],
        layout=go.Layout(
            autosize=True,
            hovermode="closest",
            mapbox=mapbox,
            margin={"t": 0},
        ),
    )

    return dcc.Graph(figure=figure, config={"displayModeBar": False})
|
||||
|
||||
|
||||
def layout():
    """Build the page layout for the Stonewater survey map app.

    Reads the three JSON data files (property locations, real EPC breakdown
    and EPC rating breakdown) using paths relative to the current working
    directory, then assembles the page: a header with logos and title, the
    map, and the two pie charts with their captions.

    Returns:
        A fluid ``dbc.Container`` holding the whole page.

    Raises:
        FileNotFoundError: If any of the three data files is missing from
            the current working directory.
    """

    def _read_json(filename):
        # The three data files are all read the same way
        with open(filename, "r", encoding="utf-8") as file:
            return json.load(file)

    # Get the data
    locations = _read_json("Stonewater Mapping Data.json")

    # Get the EPC breakdown data
    real_epc_breakdown = _read_json("Stonewater real EPC breakdown.json")

    # Get the EPC ratings data
    epc_rating_breakdown = _read_json("Stonewater EPC rating breakdown.json")

    # Banner with logos
    banner = dbc.Row(
        [
            dbc.Col(
                html.Img(src="assets/stonewater-logo.png", height="50px"),
                width="auto"
            ),
            dbc.Col(
                html.Img(src="assets/osmosis-Logo.svg", height="50px"),
                width="auto"
            ),
            dbc.Col(
                # Empty, styled placeholder that takes up the remaining width
                html.Div(
                    style={"color": "white", "font-size": "1.5rem", "font-weight": "bold"}
                ),
                width=True,
                className="text-center"
            )
        ],
        className="align-items-center",
        style={"background-color": "#027fa6", "padding": "10px"}
    )

    powered_by = dbc.Row(
        [
            dbc.Col("Powered by", style={"color": "#027fa6", "fontSize": "1rem", 'zIndex': 10},
                    width="auto"),
            dbc.Col(
                html.A(
                    html.Img(src="assets/hestia-logo.png", height="50px"),
                    href="https://hestia.homes",
                ),
                width="auto",
                style={"margin-left": "-60px"}
            ),
        ],
        # "start" is the accepted value for left alignment; the previous
        # value 'left' is not one of the values dbc.Row accepts
        justify="start",
        align="center"
    )

    header_row = dbc.Row(
        dbc.Col(
            html.Div(
                [
                    banner,
                    powered_by,
                    html.H1(
                        "Stonewater Survey Map",
                        style={"font-size": "2.5rem", "font-weight": "bold", "margin-bottom": "20px"}
                    ),
                    html.P(
                        "This map shows the location of the properties that are to be surveyed by Osmosis.",
                        style={"font-size": "1.25rem", "margin-bottom": "40px"}
                    ),
                ],
                className="text-center"
            ),
            width=12
        ),
        className="mt-5"
    )

    map_row = dbc.Row(
        dbc.Col(
            make_map(locations=locations),
            width=10,
            align="center",
            className="text-center"
        ),
        justify="center"
    )

    real_epc_column = dbc.Col(
        [
            html.Div(
                "Breakdown of real EPCs",
                style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
                className='text-center'
            ),
            html.Div(
                "This pie chart shows the proportion of real EPCs in the asset list. Currently, "
                "there are EPCs for 3736 of the 5245 properties that have a UPRN in the asset list",
                style={"marginBottom": "1em"}
            ),
            make_real_epc_piechart(real_epc_breakdown),
        ],
        width={"size": 5},
    )

    epc_rating_column = dbc.Col(
        [
            html.Div(
                "EPC Ratings for properties with an EPC",
                style={"fontSize": "1.5rem", "fontWeight": "bold", "marginBottom": "1em"},
                className='text-center'
            ),
            html.Div(
                [
                    "This pie chart shows the breakdown of EPC ratings, for properties that currently "
                    "have an EPC. "
                    "The ratings range from A to G, where surprisingly, there are two EPC properties "
                    "that were initially "
                    "expected by Parity's modelled SAP, to be EPC D or below. These properties can be"
                    " seen ",
                    html.A("here",
                           href="https://find-energy-certificate.service.gov.uk/energy-certificate"
                                "/2708-5001-7327-6090-7284",
                           target="_blank"),
                    " and ",
                    html.A("here",
                           href="https://find-energy-certificate.service.gov.uk/energy-certificate"
                                "/1037-4032-1009-0361-7292",
                           target="_blank"),
                    "."
                ],
                style={"marginBottom": "1em"}
            ),
            make_epc_rating_piechart(epc_rating_breakdown),
        ],
        width={"size": 5},
    )

    charts_row = dbc.Row(
        [real_epc_column, epc_rating_column],
        justify="center"
    )

    page = dbc.Container(
        [header_row, map_row, charts_row],
        fluid=True,
        className="p-5"
    )

    return page
|
||||
12
etl/customers/stonewater/map_app/requirements.txt
Normal file
12
etl/customers/stonewater/map_app/requirements.txt
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
dash==2.8.1
|
||||
gunicorn
|
||||
pandas
|
||||
dash-bootstrap-components==1.3.1
|
||||
boto3
|
||||
dropbox
|
||||
Flask-Caching
|
||||
dash-extensions
|
||||
mysql-connector-python
|
||||
sqlalchemy
|
||||
werkzeug==2.3.7
|
||||
python-dotenv
|
||||
45
etl/customers/stonewater/map_app/server.py
Normal file
45
etl/customers/stonewater/map_app/server.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
import logging
|
||||
import secrets
|
||||
|
||||
import dash_bootstrap_components as dbc
|
||||
from dash import html
|
||||
from dash_extensions.enrich import DashProxy, MultiplexerTransform
|
||||
import flask
|
||||
from map_page import layout
|
||||
|
||||
logger = logging.getLogger(__name__)

# We just use a simple secret key for the moment
# NOTE(review): a new random key is generated every time this module is
# imported, so the key differs between processes and across restarts.
# Confirm that nothing relies on the key staying stable.

SECRET_KEY = secrets.token_hex(24)
|
||||
|
||||
|
||||
def init_app():
    """Create and configure the Dash application for the Stonewater map.

    Builds a ``DashProxy`` app on its own Flask server, applies the Flask
    secret key, sets the page title and attaches the page layout returned
    by ``layout()``.

    Returns:
        The configured ``DashProxy`` instance.
    """
    app = DashProxy(
        __name__,
        server=flask.Flask(__name__),
        suppress_callback_exceptions=True,
        external_stylesheets=[
            dbc.themes.BOOTSTRAP,
            dbc.icons.FONT_AWESOME,
            "https://fonts.googleapis.com/css?family=Comfortaa",
        ],
        transforms=[MultiplexerTransform()]
    )

    # Set app config on the underlying Flask server
    app.server.config.update(
        SECRET_KEY=SECRET_KEY,
    )

    # The title previously read "Hesta", a misspelling of the company name
    app.title = "Hestia X Stonewater"

    # Define the layout; it is built once, here, when the app is created
    app.layout = layout()

    return app
|
||||
|
||||
|
||||
# Module-level application instance, created on import
app = init_app()
|
||||
8
etl/customers/stonewater/map_app/wsgi.py
Normal file
8
etl/customers/stonewater/map_app/wsgi.py
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
# Callbacks must be imported to run the app
import callbacks # NOQA
from server import app

# The Flask server underlying the Dash app; presumably the object a WSGI
# server such as gunicorn is pointed at (confirm against the deployment).
application = app.server

if __name__ == "__main__":
    # Runs the built-in server when this file is executed directly.
    # NOTE(review): debug=True together with host="0.0.0.0" makes the debug
    # server reachable on every network interface. Confirm this entry point
    # is only ever used for local development.
    app.run_server(port=8080, debug=True, host="0.0.0.0")
|
||||
132
etl/customers/stonewater/outputs 27th June 2024.py
Normal file
132
etl/customers/stonewater/outputs 27th June 2024.py
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
"""
|
||||
This script prepares some outputs for the stonewater project, 27th June 2024
|
||||
|
||||
The work done so far has been data cleaning and clustering.
|
||||
In this script, we do the following things:
|
||||
|
||||
1) Match the clustering data to the archetypes
|
||||
2) Do some basic analysis on the data
|
||||
3) Mapping of the archetypes
|
||||
"""
|
||||
import pandas as pd
|
||||
import json
|
||||
from utils.s3 import read_pickle_from_s3
|
||||
|
||||
# Load the asset list that already has the archetype (cluster) assignments
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
archetyped_asset_list = stonewater_asset_list[
    [
        "internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
        "archetype_representative", "rank"
    ]
].copy()
# Keep only the properties that were assigned to an archetype
archetyped_asset_list = archetyped_asset_list[archetyped_asset_list["rank"] != "NO ARCHETYPE"]
archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
# Sort
archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])

# Read in and merge on clustering features
clustering_features = read_pickle_from_s3(
    bucket_name="retrofit-data-dev",
    s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
)

# Move property-type and built-form to the first two columns
columns_to_move = ['property-type', 'built-form']

# Get the remaining columns
remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]

# Create the new column order
new_column_order = columns_to_move + remaining_columns

# Reorder the DataFrame
clustering_features = clustering_features[new_column_order]

archetyped_asset_list = archetyped_asset_list.merge(
    clustering_features,
    on="internal_id",
    how="inner"
)

archetyped_asset_list = archetyped_asset_list.rename(
    columns={
        "internal_id": "Osm. ID",
        "customer_asset_id": "Org. ref.",
        "external_address_id": "Address ID",
        "cluster": "Archetype ID",
        "archetype_representative": "Archetype Representative",
        "rank": "Archetype Group Rank",
    }
)
# Nullable integer type, so that missing UPRNs do not force the column to float
archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
# Create an extract of the features


# Look at number of combinations
# - If we look at the number of combinations of property type & built form, we have 25 unique combinations
# - If we look at the number of combinations of property type, built form, and walls description, this jumps
# massively to 237 unique combinations
# - Adding roof description to the mix, we have 857 unique combinations
# - Adding floor description, we have 1278 unique combinations
# This doesn't even begin to consider the other variables that we have in the dataset, such as the property dimensions,
# location, and other factors.
# Ideally, we would perfectly separate these variables but this is not possible, given the constraint of needing ~450
# archetypes. We will need to make some compromises here. This is where a clustering algorithm can help us.
# We don't end up with perfect separation but we can get a good enough separation to make the archetypes useful, and can
# base the archetypes on a number of energy performance metrics, as well as location and other factors.
# archetyped_asset_list[
#     ["property-type", "built-form", "walls-description", "roof-description",
#      "floor-description"]].drop_duplicates().shape

# Save this as an excel
# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)

# We store the location data, which will be used for the mapping. We just need the longitude and latitude
mapping_data = stonewater_asset_list[
    stonewater_asset_list["archetype_representative"]
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]

# internal_id is the only column the two frames share; naming it explicitly
# keeps the join key from changing if either frame gains a column
mapping_data = mapping_data.merge(
    clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
    on="internal_id",
)
mapping_data = mapping_data.drop(columns=["internal_id"])

with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w") as f:
    f.write(json.dumps(mapping_data.to_dict(orient="records")))

# We also include some data for visualising the breakdown of EPCS
# The columns are named explicitly because the default column names produced by
# value_counts().to_frame().reset_index() differ between pandas 1.x and 2.x,
# and the map app reads the "is_real_epc" and "count" keys.
proportion_of_real_epcs = (
    clustering_features["estimated"]
    .value_counts()
    .rename_axis("estimated")
    .reset_index(name="count")
)
# Invert the true and false
proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
proportion_of_real_epcs = proportion_of_real_epcs.rename(
    columns={"estimated": "is_real_epc"}
)

with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
    f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))

# Produce the breakdown of EPC ratings (real EPCs only); the map app reads
# the "EPC" and "count" keys, so the columns are again named explicitly
epc_rating_breakdown = (
    clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
    .value_counts()
    .rename_axis("current-energy-rating")
    .reset_index(name="count")
)

epc_rating_breakdown = epc_rating_breakdown.rename(
    columns={"current-energy-rating": "EPC"}
)

with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
    f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))

# Properties with a real (not estimated) EPC rating of A, joined back to the asset list
epc_a_properties = clustering_features[
    (clustering_features["current-energy-rating"] == "A")
    & (~clustering_features["estimated"])
]

epc_a_properties = epc_a_properties.merge(
    stonewater_asset_list,
    on="internal_id",
    how="inner"
)
|
||||
|
|
@ -14,6 +14,11 @@ import pandas as pd
|
|||
import time
|
||||
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
|
||||
save_dataframe_to_s3_parquet, save_pickle_to_s3
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from scipy.spatial.distance import cdist
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||
|
|
@ -673,7 +678,8 @@ def compile_data():
|
|||
# )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})
|
||||
|
||||
asset_list = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
||||
header=4
|
||||
)
|
||||
|
||||
udprn_data = pd.read_excel(
|
||||
|
|
@ -1090,6 +1096,26 @@ def concatenate_row(row):
|
|||
return ', '.join(row.dropna().replace('', None).dropna().astype(str))
|
||||
|
||||
|
||||
def adjust_clusters(cluster_allocation, total_clusters):
    """Nudge per-group cluster counts so that they sum to ``total_clusters``.

    Clusters are added or removed one at a time, visiting the groups from the
    largest allocation to the smallest. As many passes are made as are needed
    to reach the target. When removing clusters, a group is never taken below
    one cluster.

    Args:
        cluster_allocation: Dict mapping each group to its number of
            clusters. It is modified in place.
        total_clusters: The total that the allocations should sum to.

    Returns:
        The same ``cluster_allocation`` dict, after adjustment. If the target
        cannot be reached (there are no groups, or every group is already
        down to one cluster), the allocation is returned as close to the
        target as possible.
    """
    remaining = total_clusters - sum(cluster_allocation.values())
    if remaining == 0:
        return cluster_allocation

    # +1 when clusters must be added, -1 when they must be removed
    step = 1 if remaining > 0 else -1

    while remaining * step > 0:
        progressed = False
        # Largest groups absorb the change first; re-sorted on every pass
        for group in sorted(cluster_allocation, key=lambda x: -cluster_allocation[x]):
            if step < 0 and cluster_allocation[group] <= 1:
                # Every group must keep at least one cluster
                continue
            cluster_allocation[group] += step
            remaining -= step
            progressed = True
            if remaining * step <= 0:
                break
        if not progressed:
            # Nothing could be changed on this pass, so stop instead of looping forever
            break

    return cluster_allocation
|
||||
|
||||
|
||||
def compile_data_final():
|
||||
# Updated version:
|
||||
|
||||
|
|
@ -1103,7 +1129,8 @@ def compile_data_final():
|
|||
########################################################################
|
||||
|
||||
asset_list = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
||||
header=4
|
||||
)
|
||||
|
||||
udprn_data = pd.read_excel(
|
||||
|
|
@ -1633,124 +1660,243 @@ def compile_data_final():
|
|||
# )
|
||||
|
||||
# from utils.s3 import read_pickle_from_s3
|
||||
# data = read_pickle_from_s3(
|
||||
# property_attributes = read_pickle_from_s3(
|
||||
# bucket_name="retrofit-data-dev",
|
||||
# s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
|
||||
# )
|
||||
|
||||
# We perform some additional cleaning on the data
|
||||
import msgpack
|
||||
cleaned = read_from_s3(
|
||||
s3_file_name="cleaned_epc_data/cleaned.bson",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
cleaned = msgpack.unpackb(cleaned, raw=False)
|
||||
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
||||
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
||||
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
|
||||
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
|
||||
|
||||
cleaners = {
|
||||
"floor-description": FloorAttributes,
|
||||
'hotwater-description': HotWaterAttributes,
|
||||
'main-fuel': MainFuelAttributes,
|
||||
'mainheat-description': MainHeatAttributes,
|
||||
'mainheatcont-description': MainheatControlAttributes,
|
||||
'roof-description': RoofAttributes,
|
||||
'walls-description': WallAttributes,
|
||||
'windows-description': WindowAttributes,
|
||||
'lighting-description': LightingAttributes
|
||||
}
|
||||
|
||||
for variable_to_clean in cleaned.keys():
|
||||
|
||||
unique_descriptions = property_attributes[variable_to_clean].unique()
|
||||
clean_df = pd.DataFrame(cleaned[variable_to_clean])
|
||||
# Check if we have any
|
||||
missed = [x for x in unique_descriptions if x not in clean_df["original_description"].values]
|
||||
if missed:
|
||||
descriptions_to_append = []
|
||||
for description in missed:
|
||||
if variable_to_clean == "lighting-description":
|
||||
cln = cleaners[variable_to_clean](description, **{"averages": pd.DataFrame()})
|
||||
else:
|
||||
cln = cleaners[variable_to_clean](description)
|
||||
to_append = {
|
||||
"original_description": description,
|
||||
"clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
|
||||
**cln.process()
|
||||
}
|
||||
descriptions_to_append.append(to_append)
|
||||
|
||||
descriptions_to_append = pd.DataFrame(descriptions_to_append)
|
||||
clean_df = pd.concat([clean_df, descriptions_to_append])
|
||||
|
||||
clean_df = clean_df.rename(
|
||||
columns={
|
||||
"thermal_transmittance": f"{variable_to_clean}_thermal_transmittance",
|
||||
"is_assumed": f"{variable_to_clean}_is_assumed",
|
||||
}
|
||||
)
|
||||
|
||||
if 'thermal_transmittance_unit' in clean_df.columns:
|
||||
clean_df = clean_df.drop(columns=['thermal_transmittance_unit'])
|
||||
|
||||
starting_size = len(property_attributes)
|
||||
property_attributes = property_attributes.merge(
|
||||
clean_df, how="left", left_on=variable_to_clean, right_on="original_description"
|
||||
)
|
||||
if starting_size != property_attributes.shape[0]:
|
||||
raise Exception("something went wrong")
|
||||
property_attributes = property_attributes.drop(columns=["original_description", "clean_description"])
|
||||
# Fill missings
|
||||
for k in clean_df.columns:
|
||||
if k in property_attributes.columns:
|
||||
property_attributes[k] = property_attributes[k].fillna("missing")
|
||||
|
||||
# We group some variables such as thermal transmittance for walls, roof, floors
|
||||
# ranges = {
|
||||
# "< 0.1": (0, 0.1),
|
||||
# "0.1 - 0.3": (0.1, 0.3),
|
||||
# "0.3 - 0.5": (0.3, 0.5),
|
||||
# "0.5 - 0.7": (0.5, 0.7),
|
||||
# "0.9 - 1": (0.9, 1),
|
||||
# "1 - 1.5": (1, 1.5),
|
||||
# "1.5 - 2": (1.5, 2),
|
||||
# "2+": (2, 2.5)
|
||||
# }
|
||||
|
||||
ranges = {
|
||||
"< 0.1": (0, 0.1),
|
||||
"0.1 - 0.3": (0.1, 0.3),
|
||||
"0.3 - 0.5": (0.3, 0.5),
|
||||
"0.5+": (0.5, 2.5),
|
||||
}
|
||||
|
||||
# Generate the lookup table
|
||||
thermal_transmittance_lookup_table = []
|
||||
for i in range(1, 251):
|
||||
value = i / 100
|
||||
for label, (low, high) in ranges.items():
|
||||
if low < value <= high:
|
||||
thermal_transmittance_lookup_table.append({"from": value, "to": label})
|
||||
break
|
||||
|
||||
# Convert to DataFrame for display
|
||||
thermal_transmittance_lookup_table = pd.DataFrame(thermal_transmittance_lookup_table)
|
||||
thermal_transmittance_lookup_table["from"] = thermal_transmittance_lookup_table["from"].astype(str)
|
||||
|
||||
thermal_transmittance_cols = [
|
||||
c for c in property_attributes.columns if "thermal_transmittance" in c and "unit" not in c
|
||||
]
|
||||
for i, col in enumerate(thermal_transmittance_cols):
|
||||
# Perform the mapping
|
||||
to_col = f"to_{col}"
|
||||
property_attributes[col] = property_attributes[col].astype(str)
|
||||
property_attributes = property_attributes.merge(
|
||||
thermal_transmittance_lookup_table.rename(columns={"to": to_col}),
|
||||
how="left",
|
||||
left_on=col,
|
||||
right_on="from",
|
||||
suffixes=("", f"_{i}")
|
||||
)
|
||||
property_attributes = property_attributes.drop(columns=["from", col])
|
||||
property_attributes[to_col] = property_attributes[to_col].fillna("unknown")
|
||||
|
||||
# Drop the description columns that are the keys in cleaned
|
||||
print("PUT ME BACK!!??")
|
||||
property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
|
||||
# Perform the mapping
|
||||
|
||||
# CLUSTERING!!
|
||||
|
||||
# from sklearn.cluster import KMeans
|
||||
# from sklearn.preprocessing import OneHotEncoder
|
||||
# from scipy.spatial.distance import cdist
|
||||
#
|
||||
# property_attributes.set_index('internal_id', inplace=True)
|
||||
#
|
||||
# # Step 1: Prepare the data
|
||||
# # Identify categorical columns (you might need to adjust this)
|
||||
# categorical_cols = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
# for col in categorical_cols:
|
||||
# property_attributes[col] = property_attributes[col].astype(str)
|
||||
#
|
||||
# # Applying OneHotEncoder
|
||||
# encoder = OneHotEncoder(sparse=False)
|
||||
# encoded_cats = encoder.fit_transform(property_attributes[categorical_cols])
|
||||
#
|
||||
# # Creating a new DataFrame with encoded categorical data and original numerical data
|
||||
# numerical_data = property_attributes.select_dtypes(include=[np.number])
|
||||
# data_for_clustering = pd.concat([numerical_data, pd.DataFrame(encoded_cats, index=numerical_data.index)], axis=1)
|
||||
#
|
||||
# # Convert all column names to strings to satisfy KMeans requirements
|
||||
# data_for_clustering.columns = data_for_clustering.columns.astype(str)
|
||||
#
|
||||
# # Step 2: K-Means Clustering
|
||||
# k = 450 # number of clusters
|
||||
# kmeans = KMeans(n_clusters=k, random_state=0)
|
||||
# property_attributes['cluster'] = kmeans.fit_predict(data_for_clustering)
|
||||
#
|
||||
# # Extracting centroids
|
||||
# centroids = kmeans.cluster_centers_
|
||||
#
|
||||
# # Step 3: Assign clusters and rank rows
|
||||
# # Calculating distances from each point to its cluster's centroid
|
||||
# distances = cdist(data_for_clustering, centroids, 'euclidean')
|
||||
# min_distances = distances.min(axis=1)
|
||||
# property_attributes['distance_to_centroid'] = min_distances
|
||||
#
|
||||
# # Ranking rows by distance within each cluster
|
||||
# property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(method='first')
|
||||
#
|
||||
# # Sorting to verify
|
||||
# property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
|
||||
#
|
||||
# # Optional: Displaying the dataframe
|
||||
# print(property_attributes.head())
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from scipy.spatial.distance import cdist
|
||||
id_column = 'internal_id'
|
||||
property_attributes.set_index(id_column, inplace=True)
|
||||
grouping_columns = [
|
||||
'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
|
||||
]
|
||||
|
||||
# Define the preprocessing for numerical and categorical features
|
||||
numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
categorical_features = [c for c in categorical_features if c not in ["internal_id", grouping_columns]]
|
||||
|
||||
for col in categorical_features:
|
||||
property_attributes[col] = property_attributes[col].astype(str)
|
||||
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
('num', StandardScaler(), numerical_features),
|
||||
('cat', OneHotEncoder(), categorical_features)
|
||||
id_column = 'internal_id'
|
||||
n_clusters = 450
|
||||
random_state = 0
|
||||
|
||||
training_data_grouped = property_attributes.groupby(grouping_columns)
|
||||
group_sizes = {name: len(group) for name, group in training_data_grouped}
|
||||
total_size = sum(group_sizes.values())
|
||||
cluster_allocation = {
|
||||
name: max(1, int(round(n_clusters * (size / total_size)))) for name, size in group_sizes.items()
|
||||
}
|
||||
|
||||
# Adjust cluster allocation to ensure total clusters sum to 450
|
||||
cluster_allocation = adjust_clusters(cluster_allocation, n_clusters)
|
||||
|
||||
# TODO: This code throws many warnings because of the highly fragmented dataframe. We should re-factor this to
|
||||
# collect the results of the clustering and then perform the transformations afterwards
|
||||
|
||||
final_clusters = []
|
||||
for group_variables, group_data in tqdm(training_data_grouped, total=len(training_data_grouped)):
|
||||
|
||||
group_n_clusters = cluster_allocation[group_variables]
|
||||
group_data.set_index(id_column, inplace=True)
|
||||
|
||||
preprocessor = ColumnTransformer(
|
||||
transformers=[
|
||||
('num', StandardScaler(), numerical_features),
|
||||
('cat', OneHotEncoder(), categorical_features)
|
||||
]
|
||||
)
|
||||
|
||||
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
||||
('kmeans', KMeans(n_clusters=group_n_clusters, random_state=random_state))])
|
||||
|
||||
# Fit the pipeline to the data
|
||||
pipeline.fit(group_data)
|
||||
|
||||
# Transform the data using the fitted pipeline
|
||||
processed_data = pipeline.named_steps['preprocessor'].transform(group_data)
|
||||
|
||||
# Get cluster labels
|
||||
group_data['cluster'] = pipeline.named_steps['kmeans'].labels_
|
||||
|
||||
# Get centroids (already in the same transformed space)
|
||||
centroids = pipeline.named_steps['kmeans'].cluster_centers_
|
||||
|
||||
# if the data isn't an array, make it one
|
||||
if not isinstance(processed_data, np.ndarray):
|
||||
processed_data = processed_data.toarray()
|
||||
|
||||
# Calculate distances from each point to the centroid of its cluster
|
||||
distances_to_centroids = [
|
||||
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
|
||||
for i, label in enumerate(group_data['cluster'])
|
||||
]
|
||||
)
|
||||
|
||||
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
||||
('kmeans', KMeans(n_clusters=450, random_state=0))])
|
||||
group_data['distance_to_centroid'] = distances_to_centroids
|
||||
|
||||
# Fit the pipeline to the data
|
||||
pipeline.fit(property_attributes)
|
||||
# for cluster_id in group_data['cluster'].unique():
|
||||
# cluster_data = group_data[group_data['cluster'] == cluster_id]
|
||||
# min_distance = cluster_data['distance_to_centroid'].min()
|
||||
# print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
|
||||
# if min_distance != 0:
|
||||
# print(f"No point with zero distance found in cluster {cluster_id}")
|
||||
|
||||
# Transform the data using the fitted pipeline
|
||||
processed_data = pipeline.named_steps['preprocessor'].transform(property_attributes)
|
||||
# Ranking rows by distance within each cluster
|
||||
group_data['rank'] = group_data.groupby('cluster')['distance_to_centroid'].rank(method='first')
|
||||
|
||||
# Get cluster labels
|
||||
property_attributes['cluster'] = pipeline.named_steps['kmeans'].labels_
|
||||
# Sorting to verify
|
||||
group_data.sort_values(by=['cluster', 'rank'], inplace=True)
|
||||
group_data.reset_index(inplace=True)
|
||||
|
||||
# Get centroids (already in the same transformed space)
|
||||
centroids = pipeline.named_steps['kmeans'].cluster_centers_
|
||||
to_append = group_data[["internal_id", "cluster", "rank"]].copy()
|
||||
to_append["cluster"] = to_append["cluster"].astype(str) + str(group_variables)
|
||||
final_clusters.append(to_append)
|
||||
|
||||
processed_data = processed_data.toarray()
|
||||
final_clusters = pd.concat(final_clusters)
|
||||
# remap the clusters from the current names to 1 -> n_clusters
|
||||
|
||||
# Calculate distances from each point to the centroid of its cluster
|
||||
distances_to_centroids = [
|
||||
cdist(processed_data[i].reshape(1, -1), centroids[label].reshape(1, -1)).flatten()[0]
|
||||
for i, label in enumerate(property_attributes['cluster'])
|
||||
]
|
||||
|
||||
property_attributes['distance_to_centroid'] = distances_to_centroids
|
||||
|
||||
for cluster_id in property_attributes['cluster'].unique():
|
||||
cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
|
||||
min_distance = cluster_data['distance_to_centroid'].min()
|
||||
print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
|
||||
if min_distance != 0:
|
||||
print(f"No point with zero distance found in cluster {cluster_id}")
|
||||
|
||||
# Ranking rows by distance within each cluster
|
||||
property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(
|
||||
method='first')
|
||||
|
||||
# Sorting to verify
|
||||
property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
|
||||
cluster_mapping = {cluster: i for i, cluster in enumerate(final_clusters["cluster"].unique())}
|
||||
final_clusters["cluster"] = final_clusters["cluster"].map(cluster_mapping)
|
||||
final_clusters["cluster"] = final_clusters["cluster"].astype(str)
|
||||
|
||||
################################################
|
||||
# Prepare outputs!!!!
|
||||
################################################
|
||||
|
||||
property_attributes.reset_index(inplace=True)
|
||||
property_attributes = property_attributes.merge(
|
||||
final_clusters, how="left", on="internal_id"
|
||||
)
|
||||
property_attributes["archetype_representative"] = property_attributes["rank"] == 1
|
||||
|
||||
asset_list_with_archetypes = asset_list.merge(
|
||||
|
|
@ -1769,7 +1915,7 @@ def compile_data_final():
|
|||
asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
|
||||
"archetype_representative"].fillna(False)
|
||||
|
||||
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False)
|
||||
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes V2.csv", index=False)
|
||||
|
||||
stonewater_uprn_lookup = asset_list_with_archetypes[
|
||||
["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
|
||||
|
|
@ -1777,110 +1923,6 @@ def compile_data_final():
|
|||
|
||||
stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")
|
||||
|
||||
################################################
|
||||
# Agglomerative Clustering
|
||||
################################################
|
||||
|
||||
# from sklearn.cluster import KMeans, AgglomerativeClustering
|
||||
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
# from sklearn.compose import ColumnTransformer
|
||||
# from sklearn.pipeline import Pipeline
|
||||
# from scipy.spatial.distance import cdist
|
||||
# import numpy as np
|
||||
# from collections import Counter
|
||||
#
|
||||
# id_column = 'internal_id'
|
||||
# property_attributes.set_index(id_column, inplace=True)
|
||||
#
|
||||
# # Define the preprocessing for numerical and categorical features
|
||||
# numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
# categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
#
|
||||
# for col in categorical_features:
|
||||
# property_attributes[col] = property_attributes[col].astype(str)
|
||||
#
|
||||
# preprocessor = ColumnTransformer(
|
||||
# transformers=[
|
||||
# ('num', StandardScaler(), numerical_features),
|
||||
# ('cat', OneHotEncoder(sparse_output=False), categorical_features)
|
||||
# ]
|
||||
# )
|
||||
#
|
||||
# # Function to perform clustering and merge small clusters
|
||||
# def cluster_with_min_size(data, preprocessor, n_clusters=10, min_size=5):
|
||||
# while True:
|
||||
# # Preprocess the data
|
||||
# processed_data = preprocessor.fit_transform(data)
|
||||
#
|
||||
# # Initial clustering
|
||||
# clustering = AgglomerativeClustering(n_clusters=n_clusters)
|
||||
# labels = clustering.fit_predict(processed_data)
|
||||
#
|
||||
# # Check cluster sizes
|
||||
# cluster_counts = Counter(labels)
|
||||
#
|
||||
# # Find clusters smaller than min_size
|
||||
# small_clusters = {cluster for cluster, count in cluster_counts.items() if count < min_size}
|
||||
#
|
||||
# if not small_clusters:
|
||||
# break
|
||||
#
|
||||
# # Merge small clusters
|
||||
# for cluster in small_clusters:
|
||||
# # Find the nearest cluster to merge with
|
||||
# cluster_data = processed_data[labels == cluster]
|
||||
# other_clusters = [i for i in range(n_clusters) if i not in small_clusters]
|
||||
# other_cluster_data = [processed_data[labels == i] for i in other_clusters]
|
||||
# other_centroids = np.vstack([data.mean(axis=0) for data in other_cluster_data])
|
||||
#
|
||||
# distances = cdist(cluster_data, other_centroids).mean(axis=0)
|
||||
# closest_cluster = other_clusters[np.argmin(distances)]
|
||||
#
|
||||
# labels[labels == cluster] = closest_cluster
|
||||
#
|
||||
# n_clusters -= len(small_clusters)
|
||||
#
|
||||
# return labels
|
||||
#
|
||||
# # Perform clustering with minimum size constraint
|
||||
# n_clusters = 10
|
||||
# min_size = 5
|
||||
# property_attributes['cluster'] = cluster_with_min_size(property_attributes, preprocessor, n_clusters, min_size)
|
||||
#
|
||||
# # Filter out empty clusters
|
||||
# valid_clusters = property_attributes['cluster'].unique()
|
||||
#
|
||||
# # Get centroids for the resulting clusters
|
||||
# processed_data = preprocessor.transform(property_attributes.drop(columns=["cluster"]))
|
||||
# centroids = np.vstack([processed_data[property_attributes['cluster'] == i].mean(axis=0) for i in valid_clusters])
|
||||
#
|
||||
# # Calculate distances from each point to the centroid of its cluster
|
||||
# distances_to_centroids = [
|
||||
# cdist(processed_data[i].reshape(1, -1),
|
||||
# centroids[valid_clusters.tolist().index(label)].reshape(1, -1)).flatten()[0]
|
||||
# for i, label in enumerate(property_attributes['cluster'])
|
||||
# ]
|
||||
#
|
||||
# property_attributes['distance_to_centroid'] = distances_to_centroids
|
||||
#
|
||||
# # Verify that at least one point in each cluster has zero distance to the centroid
|
||||
# for cluster_id in valid_clusters:
|
||||
# cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
|
||||
# min_distance = cluster_data['distance_to_centroid'].min()
|
||||
# print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
|
||||
# if min_distance != 0:
|
||||
# print(f"No point with zero distance found in cluster {cluster_id}")
|
||||
#
|
||||
# # Rank the distances within each cluster
|
||||
# property_attributes['rank_within_cluster'] = property_attributes.groupby('cluster')['distance_to_centroid'] \
|
||||
# .rank(method='first')
|
||||
#
|
||||
# # Reset index to get 'internal_id' back
|
||||
# property_attributes.reset_index(inplace=True)
|
||||
#
|
||||
# # Display the DataFrame
|
||||
# print(property_attributes)
|
||||
|
||||
|
||||
def pull_ideal_postcodes(missing_uprn_with_udprn):
|
||||
api_key = "" # Log into the platform the get the API key: https://account.ideal-postcodes.co.uk/
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ class FloorAttributes(Definitions):
|
|||
self.description: str = description.lower()
|
||||
|
||||
self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
|
||||
description in self.OBSERVED_ERRORS)
|
||||
description in self.OBSERVED_ERRORS) or (self.description == "sap05:floor")
|
||||
|
||||
# Try to perform a translation, in case it's in Welsh
|
||||
self.translate_welsh_text()
|
||||
|
|
|
|||
|
|
@ -129,7 +129,9 @@ class HotWaterAttributes(Definitions):
|
|||
def __init__(self, description: str):
|
||||
self.description: str = clean_description(description.lower()).strip()
|
||||
|
||||
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
|
||||
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
|
||||
self.description == "sap05 hot-water"
|
||||
)
|
||||
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,18 @@
|
|||
import re
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import clean_description
|
||||
from etl.epc_clean.utils import correct_spelling
|
||||
|
||||
|
||||
class LightingAttributes:
|
||||
class LightingAttributes(Definitions):
|
||||
WELSH_TEXT = {
|
||||
"goleuadau ynni-isel ym mhob un ogçör mannau gosod": "low energy lighting in all fixed outlets",
|
||||
"dim goleuadau ynni-isel": "no low energy lighting",
|
||||
"goleuadau ynni-isel ym mhob un o'r mannau gosod": 'Low energy lighting in all fixed outlets'
|
||||
}
|
||||
|
||||
OBSERVED_ERRORS = []
|
||||
|
||||
def __init__(self, description, averages):
|
||||
self.description: str = clean_description(description.lower())
|
||||
|
||||
|
|
@ -18,6 +21,9 @@ class LightingAttributes:
|
|||
self.description = correct_spelling(self.description)
|
||||
self.averages = averages
|
||||
|
||||
self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
|
||||
description in self.OBSERVED_ERRORS) or (description == "SAP05:Lighting")
|
||||
|
||||
def welsh_translation_search(self):
|
||||
"""
|
||||
For welsh text describing the percentage of low energy lighting, we match the regular
|
||||
|
|
@ -40,6 +46,9 @@ class LightingAttributes:
|
|||
|
||||
description = self.description
|
||||
|
||||
if self.nodata:
|
||||
return {"low_energy_proportion": None}
|
||||
|
||||
if 'no low energy lighting' in description:
|
||||
return {"low_energy_proportion": 0}
|
||||
|
||||
|
|
|
|||
|
|
@ -77,7 +77,9 @@ class MainHeatAttributes(Definitions):
|
|||
|
||||
self.description: str = clean_description(self.description).strip()
|
||||
# Remove special characters
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or (
|
||||
description == "SAP05:Main-Heating"
|
||||
)
|
||||
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
if translation:
|
||||
|
|
@ -97,11 +99,12 @@ class MainHeatAttributes(Definitions):
|
|||
|
||||
self.process_edge_cases()
|
||||
|
||||
if (not description or not any(
|
||||
rt in self.description for rt in
|
||||
self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
|
||||
) and not self.is_edge_case):
|
||||
raise ValueError('Invalid description')
|
||||
if not self.nodata:
|
||||
if (not description or not any(
|
||||
rt in self.description for rt in
|
||||
self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
|
||||
) and not self.is_edge_case):
|
||||
raise ValueError('Invalid description')
|
||||
|
||||
def process_edge_cases(self) -> (dict, bool):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -117,7 +117,9 @@ class MainheatControlAttributes(Definitions):
|
|||
|
||||
def __init__(self, description: str):
|
||||
self.description: str = clean_description(description.lower()).strip()
|
||||
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
|
||||
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
|
||||
description == "SAP05:Main-Heating-Controls"
|
||||
)
|
||||
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
if translation:
|
||||
|
|
|
|||
|
|
@ -75,12 +75,19 @@ class WallAttributes(Definitions):
|
|||
'insulation_thickness', 'external_insulation', 'internal_insulation'
|
||||
]
|
||||
|
||||
CORRECTIONS = {
|
||||
"Granite or whin, as built, no insulation (assumed)": "Granite or whinstone, as built, no insulation (assumed)",
|
||||
}
|
||||
|
||||
def __init__(self, description: str):
|
||||
"""
|
||||
:param description: Description of the walls.
|
||||
"""
|
||||
self.description: str = description
|
||||
|
||||
if self.description in self.CORRECTIONS:
|
||||
self.description = self.CORRECTIONS[self.description]
|
||||
|
||||
self.welsh_translation_search()
|
||||
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ class WindowAttributes(Definitions):
|
|||
|
||||
# In the case of an empty description, we want to return a dictionary with all values set to False
|
||||
# and indicate there was no data
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or description == "SAP05:Windows"
|
||||
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
if translation:
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@ import re
|
|||
import string
|
||||
from typing import Tuple, Union, Dict, List
|
||||
|
||||
THERMAL_TRANSMITTANCE_STR = r"average thermal transmittance (-?\d+(\.\d+)?)\s(w/m\S+k)"
|
||||
THERMAL_TRANSMITTANCE_REGEX = re.compile(THERMAL_TRANSMITTANCE_STR)
|
||||
THERMAL_TRANSMITTANCE_STR = r"average thermal transmittance\s*[=:-]?\s*(-?\d+(\.\d+)?)\s*[wW]/m\S*[kK]"
|
||||
THERMAL_TRANSMITTANCE_REGEX = re.compile(THERMAL_TRANSMITTANCE_STR, re.IGNORECASE)
|
||||
|
||||
DOUBLE_SPACE_PATTERN = re.compile(r"\s+")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue