Merge pull request #526 from Hestia-Homes/main

Adding in new etl pipeline handling
This commit is contained in:
KhalimCK 2025-11-03 18:51:45 +00:00 committed by GitHub
commit 058f91ad77
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 1134 additions and 292 deletions

4
.gitignore vendored
View file

@ -275,4 +275,6 @@ cache/
*/.idea
*.png
*.pptx
*.pptx
local_data*

View file

@ -4,6 +4,7 @@ import pandas as pd
from etl.epc.settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
POST_SAP10_DATE,
# IGNORED_TRANSACTION_TYPES,
IGNORED_FLOOR_LEVELS,
IGNORED_PROPERTY_TYPES,
@ -21,7 +22,7 @@ from etl.epc.settings import (
ENDING_SUFFIX_COMPONENT_COLS,
POTENTIAL_COLUMNS,
EFFICIENCY_FEATURES,
DATA_ANOMALY_MATCHES
DATA_ANOMALY_MATCHES,
)
from recommendations.rdsap_tables import FLOOR_LEVEL_MAP
@ -47,6 +48,8 @@ construction_age_bounds_map = {
"England and Wales: 2003-2006": {"l": 2003, "u": 2006},
"England and Wales: 2007-2011": {"l": 2007, "u": 2011},
"England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
"England and Wales: 2012-2021": {"l": 2012, "u": 2021},
"England and Wales: 2022 onwards": {"l": 2022, "u": 3000},
}
construction_age_remap = {
@ -157,6 +160,9 @@ class EPCDataProcessor:
# colnames=["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
# )
# Create post sap10 flag
self.create_post_sap10_flag()
# When running in newdata mode, cleaning_averages is lower-cased, so we coerce it back to upper case
cleaning_averages = self.cleaning_averages.copy()
if self.run_mode == "newdata":
@ -173,6 +179,13 @@ class EPCDataProcessor:
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
self.cast_data_columns_to_lower()
def create_post_sap10_flag(self):
    """
    Add an ``is_post_sap10`` column to ``self.data``.

    Each row holds the result of comparing its ``LODGEMENT_DATE`` against
    ``POST_SAP10_DATE`` with ``>=``, i.e. the flag is set for records lodged
    on or after that date.
    """
    lodgement_dates = self.data["LODGEMENT_DATE"]
    lodged_post_sap10 = lodgement_dates >= POST_SAP10_DATE
    self.data["is_post_sap10"] = lodged_post_sap10
def cast_data_columns_to_lower(self):
"""
Convert all columns names to lower
@ -247,7 +260,8 @@ class EPCDataProcessor:
# Map all anomaly values to None
data_anomaly_map = dict(
zip(
DATA_ANOMALY_MATCHES, [None] * len(DATA_ANOMALY_MATCHES),
DATA_ANOMALY_MATCHES,
[None] * len(DATA_ANOMALY_MATCHES),
)
)
@ -374,7 +388,7 @@ class EPCDataProcessor:
has_missings = pd.isnull(self.data[col]).sum()
while has_missings:
self.data = apply_clean(
data=self.data, matching_columns=matching_columns[0: to_index + 1]
data=self.data, matching_columns=matching_columns[0 : to_index + 1]
)
has_missings = pd.isnull(self.data[col]).sum()
@ -747,6 +761,12 @@ class EPCDataProcessor:
self.data = self.data[~pd.isnull(self.data["HOTWATER_DESCRIPTION"])]
self.data = self.data[~pd.isnull(self.data["ROOF_DESCRIPTION"])]
# Remove any walls described as Basement walls since these are non-standard
# TODO: CHECK IF WE SHOULD MAP THESE U VALUES INSTEAD
index_to_remove = self.data["WALLS_DESCRIPTION"] == "Basement wall"
print(f"Removing {index_to_remove.sum()} records with basement walls")
self.data = self.data[~index_to_remove]
# Because park homes are surveyed unusually (for example, we don't have u-values to
# look up for their different components, they need to be collected in survey and aren't reflected in
# EPCs) we'll ignore them from the model
@ -848,7 +868,9 @@ class EPCDataProcessor:
# Fill NaN values with averages
for col in cols_to_clean:
data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"])
data_to_clean[col] = data_to_clean[col].fillna(
data_to_clean[f"{col}_AVERAGE"]
)
data_to_clean = data_to_clean.drop(columns=[f"{col}_AVERAGE"])
# If we still have missings
data_to_clean[col] = data_to_clean[col].fillna(data_to_clean[col].mean())

View file

@ -8,7 +8,9 @@ from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from etl.epc_clean.epc_attributes.MainheatControlAttributes import (
MainheatControlAttributes,
)
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
@ -169,7 +171,7 @@ class TrainingDataset(BaseDataset):
self.df = pd.DataFrame([dataset.difference_record for dataset in datasets])
self._feature_generation()
self._drop_features()
# self._drop_features()
self._clean_efficiency_variables()
self._null_validation(information="Clean Efficiency Variables")
self._expand_description_to_features(cleaned_lookup)
@ -210,11 +212,11 @@ class TrainingDataset(BaseDataset):
common_cols = [[col + "_starting", col + "_ending"] for col in common_cols]
self.df = self.df.loc[
:,
no_suffix_cols
+ only_ending_cols
+ [col for cols in common_cols for col in cols],
]
:,
no_suffix_cols
+ only_ending_cols
+ [col for cols in common_cols for col in cols],
]
def _remove_abnormal_change_in_floor_area(self):
"""
@ -519,7 +521,7 @@ class TrainingDataset(BaseDataset):
expanded_df["is_sandstone_or_limestone"]
== expanded_df["is_sandstone_or_limestone_ending"]
)
]
]
elif component == "floor":
expanded_df = expanded_df[
(expanded_df["is_suspended"] == expanded_df["is_suspended_ending"])
@ -536,7 +538,7 @@ class TrainingDataset(BaseDataset):
expanded_df["is_to_external_air"]
== expanded_df["is_to_external_air_ending"]
)
]
]
elif component == "roof":
expanded_df = expanded_df[
(expanded_df["is_pitched"] == expanded_df["is_pitched_ending"])
@ -549,7 +551,7 @@ class TrainingDataset(BaseDataset):
expanded_df["has_dwelling_above"]
== expanded_df["has_dwelling_above_ending"]
)
]
]
return expanded_df
@ -695,10 +697,14 @@ class TrainingDataset(BaseDataset):
cleaned_lookup_df_for_key = pd.DataFrame(cleaned_lookup[cleaned_key])
# We handle a specific edge case where we're missing information for the original description
descriptions = [x for x in self.df[left_on_starting].unique() if pd.notnull(x)]
descriptions = [
x for x in self.df[left_on_starting].unique() if pd.notnull(x)
]
# take any not in the cleaned lookup
missing_descriptions = [
x for x in descriptions if x not in cleaned_lookup_df_for_key["original_description"].values
x
for x in descriptions
if x not in cleaned_lookup_df_for_key["original_description"].values
]
if missing_descriptions:
# We handle them here
@ -707,12 +713,18 @@ class TrainingDataset(BaseDataset):
for x in missing_descriptions:
desc_cleaner = cleaner(x)
cleaned = desc_cleaner.process()
# IF NODATA, REMAP TO NONE VALUES
if all((pd.DataFrame(cleaned, index=[0]).T)[0] == False):
cleaned = {key: None for key in cleaned.keys()}
cleaned_data.append(
{
"original_description": x,
"clean_description": desc_cleaner.description.replace("(assumed)",
"").rstrip().capitalize(),
**cleaned
"clean_description": desc_cleaner.description.replace(
"(assumed)", ""
)
.rstrip()
.capitalize(),
**cleaned,
}
)
cleaned_lookup_df_for_key = pd.concat(

View file

@ -23,6 +23,7 @@ from etl.epc.settings import (
POTENTIAL_COLUMNS,
ROOM_FEATURES,
COST_FEATURES,
POST_SAP10_FEATURE,
)
# TODO: change in setting file
@ -76,6 +77,51 @@ new_walls_description_mapping.loc[
clean_lookup["walls-description"] = new_walls_description_mapping.to_dict(
orient="records"
)
# TODO: THIS IS A TEMPORARY FIX
# The two raw descriptions listed below have every main-heating-control feature
# column reset to None in the lookup.
# NOTE(review): presumably these are placeholder values that carry no real
# description - confirm against the source data.
_MAINHEATCONT_DESCRIPTIONS_TO_BLANK = ["SAP:Main-Heating-Controls", " "]
_MAINHEATCONT_FEATURE_COLUMNS = [
    "thermostatic_control",
    "charging_system",
    "switch_system",
    "no_control",
    "dhw_control",
    "community_heating",
    "multiple_room_thermostats",
    "auxiliary_systems",
    "trvs",
    "rate_control",
]

new_mainheatcont_mapping = pd.DataFrame(clean_lookup["mainheatcont-description"])
# A single isin() mask replaces two identical .loc assignments that differed only
# in the description being matched.
new_mainheatcont_mapping.loc[
    new_mainheatcont_mapping["original_description"].isin(
        _MAINHEATCONT_DESCRIPTIONS_TO_BLANK
    ),
    _MAINHEATCONT_FEATURE_COLUMNS,
] = None

clean_lookup["mainheatcont-description"] = new_mainheatcont_mapping.to_dict(
    orient="records"
)

# TEMP FIX - GRANITE OR WHINSTONE BOOLEAN ISSUE
# Force the granite/whinstone flag on for every wall description that mentions
# "Granite" (case-sensitive substring match).
new_walls_description_mapping = pd.DataFrame(clean_lookup["walls-description"])
new_walls_description_mapping.loc[
    # na=False: a null original_description would otherwise put NaN into the mask,
    # and .loc refuses to index with a mask that contains NA values.
    new_walls_description_mapping["original_description"].str.contains(
        "Granite", na=False
    ),
    "is_granite_or_whinstone",
] = True

clean_lookup["walls-description"] = new_walls_description_mapping.to_dict(
    orient="records"
)
class EPCPipeline:
@ -280,7 +326,9 @@ class EPCPipeline:
# We include the lodgement date here as we probably need to factor time into the
# model, since EPC standards and rigour have changed over time
variable_data = property_data[VARIABLE_DATA_FEATURES + COST_FEATURES]
variable_data = property_data[
VARIABLE_DATA_FEATURES + COST_FEATURES + POST_SAP10_FEATURE
]
uprn = str(uprn)
epc_records = [

View file

@ -20,6 +20,7 @@ from etl.epc.settings import (
COMPONENT_FEATURES,
EFFICIENCY_FEATURES,
ROOM_FEATURES,
POST_SAP10_FEATURE,
)
from recommendations.recommendation_utils import estimate_number_of_floors
from utils.s3 import read_dataframe_from_s3_parquet
@ -89,6 +90,7 @@ class EPCRecord:
co2_emissions_current: float = None
number_habitable_rooms: float = None
number_heated_rooms: float = None
is_post_sap10: bool = None
# u_values_walls = None
# u_values_roof = None
@ -277,6 +279,7 @@ class EPCRecord:
self.number_heated_rooms: float = float(
self.prepared_epc["number_heated_rooms"]
)
self.is_post_sap10: bool = bool(self.prepared_epc["is_post_sap10"])
def _identify_delta_between_prepared_and_original_records(self):
"""
@ -385,11 +388,11 @@ class EPCRecord:
return df
def _clean_floor_height(self):
""" Remaps anomalies in floor height to the average floor height for the property type """
"""Remaps anomalies in floor height to the average floor height for the property type"""
floor_height_data = self.cleaning_data[
(self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) &
(self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
]
(self.cleaning_data["property_type"] == self.prepared_epc["property-type"])
& (self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
]
average = floor_height_data["floor_height"].mean()
sd = floor_height_data["floor_height"].std()
# If we're in the top 0.5 percentile of floor heights, we'll set it to the average
@ -399,14 +402,16 @@ class EPCRecord:
self.prepared_epc["floor-height"] = average
def _clean_new_build_descriptions(self):
for col in ['roof-description', 'walls-description', 'floor-description']:
for col in ["roof-description", "walls-description", "floor-description"]:
self.prepared_epc[col] = self.prepared_epc[col].replace("W/m²K", "W/m-¦K")
def _clean_constituency(self):
"""
We handle the single case of finding a missing constituency by using the local authority
"""
if pd.isnull(self.prepared_epc["constituency"]) or (self.prepared_epc["constituency"] == ""):
if pd.isnull(self.prepared_epc["constituency"]) or (
self.prepared_epc["constituency"] == ""
):
if self.prepared_epc["local-authority"] != "E06000044":
raise NotImplementedError(
"This function is only implemented for Portsmouth, in the single edgecase seen"
@ -595,12 +600,12 @@ class EPCRecord:
# We handle the edge case of floor area being 0. We set it to None and it is then cleaned by
# _clean_with_data_processor
if self.prepared_epc['total-floor-area'] == 0:
if self.prepared_epc["total-floor-area"] == 0:
print(
"Edge case of floor area being zero - will set to none and will be cleaned in "
"_clean_with_data_processor"
)
self.prepared_epc['total-floor-area'] = None
self.prepared_epc["total-floor-area"] = None
def _clean_mains_gas(self):
"""
@ -609,12 +614,7 @@ class EPCRecord:
if not self.prepared_epc:
raise ValueError("EPC Recrod doesn not contain epc data")
mains_gas_map = {
"Y": True,
"N": False,
True: True,
False: False
}
mains_gas_map = {"Y": True, "N": False, True: True, False: False}
self.prepared_epc["mains-gas-flag"] = (
None
@ -1064,7 +1064,12 @@ class EPCDifferenceRecord:
CARBON_RESPONSE
)
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES + ROOM_FEATURES
component_variables = (
COMPONENT_FEATURES
+ EFFICIENCY_FEATURES
+ ROOM_FEATURES
+ POST_SAP10_FEATURE
)
ending_record = self.record2.get(
component_variables + ["lodgement_date"],
return_asdict=True,

View file

@ -12,7 +12,7 @@ def main():
"""
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
# directories = directories[0:3]
# directories = directories[235:275]
epc_pipeline = EPCPipeline(
directories=directories,

View file

@ -49,9 +49,12 @@ DATA_ANOMALY_MATCHES = {
# An older value which rarely shows up but has been seen in the data.
"UNKNOWN",
#
"Unknown"
"Unknown",
}
# Cut-off date for the post-SAP10 flag: EPCs lodged on or after this date are treated as post SAP10
POST_SAP10_DATE = "2025-06-22"
DATA_ANOMALY_SUBSTRINGS = {
# Where values in a pick list that have been superseded by another value. For example, where a value for
# pitched roof has been replaced by three sub-categories of pitched roof. The original value is retained
@ -184,6 +187,8 @@ EFFICIENCY_FEATURES = [
ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"]
POST_SAP10_FEATURE = ["is_post_sap10"]
COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
"TRANSACTION_TYPE",
"ENERGY_TARIFF", # Not sure if this is relevant

View file

@ -1,17 +1,26 @@
import re
from typing import Dict, Union
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types
from etl.epc_clean.epc_attributes.attribute_utils import (
extract_thermal_transmittance,
extract_component_types,
)
class FloorAttributes(Definitions):
DWELLING_BELOW = ["another dwelling below", "other premises below"]
FLOOR_TYPES = ["assumed", "to unheated space", "to external air", "suspended", "solid"]
FLOOR_TYPES = [
"assumed",
"to unheated space",
"to external air",
"suspended",
"solid",
]
# For the short term, while we are still exploring the data, we maintain a list of error cases which
# we want to ignore and consider as no data.
OBSERVED_ERRORS = ["Conservatory", "insulated"]
OBSERVED_ERRORS = ["Conservatory", "insulated", "Basement"]
WELSH_TEXT = {
"(anheddiad arall islaw)": "(another dwelling below)",
@ -35,32 +44,40 @@ class FloorAttributes(Definitions):
"i ofod heb ei wresogi, heb ei inswleiddio (rhagdybiaeth)": "to unheated space, no insulation (assumed)",
"i ofod heb ei wresogi, dim inswleiddio": "to unheated space, no insulation",
"igçör awyr y tu allan, wedigçöi inswleiddio (rhagdybiaeth)": "to external air, insulated (assumed)",
"crog, inswleiddio cyfyngedig (rhagdybiaeth)": "suspended, limited insulation (assumed)"
"crog, inswleiddio cyfyngedig (rhagdybiaeth)": "suspended, limited insulation (assumed)",
}
def __init__(self, description: str):
    """
    Store the lower-cased description, decide whether it counts as "no data",
    attempt a Welsh translation and validate the result.

    :param description: raw floor description text
    :raises ValueError: if the description is not flagged as no-data and
        contains none of the known floor types, dwelling-below phrases or the
        "average thermal transmittance" phrase
    """
    self.description: str = description.lower()

    # The anomaly and observed-error lookups use the raw text as supplied; only
    # the "sap05:floor" comparison uses the lower-cased copy.
    if not description:
        self.nodata = True
    elif (
        description in self.DATA_ANOMALY_MATCHES
        or description in self.OBSERVED_ERRORS
    ):
        self.nodata = True
    else:
        self.nodata = self.description == "sap05:floor"

    # Try and perform a translation, in case it's in Welsh
    self.translate_welsh_text()

    if self.nodata:
        return

    recognised_terms = (
        self.FLOOR_TYPES + self.DWELLING_BELOW + ["average thermal transmittance"]
    )
    if not any(term in self.description for term in recognised_terms):
        raise ValueError("Invalid description")
def translate_welsh_text(self):
uvalue_match = re.search(
r'trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m-¦k', self.description
r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m-¦k",
self.description,
)
uvalue_match2 = re.search(
r'trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m.+k', self.description
r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m.+k",
self.description,
)
# Step 2: Generalized translation with placeholder
@ -69,7 +86,7 @@ class FloorAttributes(Definitions):
uvalue = uvalue_match.group(1)
else:
uvalue = uvalue_match2.group(1)
self.description = f'average thermal transmittance {uvalue} w/m-¦k'
self.description = f"average thermal transmittance {uvalue} w/m-¦k"
else:
translation = self.WELSH_TEXT.get(self.description)
@ -89,11 +106,15 @@ class FloorAttributes(Definitions):
result, description = extract_thermal_transmittance(result, description)
# floor type
result, description = extract_component_types(result, description, list_of_components=self.FLOOR_TYPES)
result, description = extract_component_types(
result, description, list_of_components=self.FLOOR_TYPES
)
# check if there is another dwelling below
result['another_property_below'] = "(another dwelling below)" in description or "(other premises below)" in \
description
result["another_property_below"] = (
"(another dwelling below)" in description
or "(other premises below)" in description
)
thickness_map = {
"external insulation": "average",
@ -102,17 +123,17 @@ class FloorAttributes(Definitions):
"partial insulation": "below average",
"no insulation": "none",
"additional insulation": "above average",
"insulated": "average"
"insulated": "average",
}
for key, value in thickness_map.items():
if key in description:
result['insulation_thickness'] = value
result["insulation_thickness"] = value
break
else:
result['insulation_thickness'] = None
result["insulation_thickness"] = None
if result["another_property_below"]:
result["thermal_transmittance"] = 0
result["thermal_transmittance_unit"] = 'w/m-¦k'
result["thermal_transmittance_unit"] = "w/m-¦k"
return result

View file

@ -1,12 +1,28 @@
import re
from typing import Dict, Union
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance
from etl.epc_clean.epc_attributes.attribute_utils import (
extract_component_types,
extract_thermal_transmittance,
)
class RoofAttributes(Definitions):
ROOF_TYPES = ['pitched', 'roof room', 'loft', 'flat', 'thatched', 'at rafters', 'assumed']
DWELLING_ABOVE = ["another dwelling above", "other premises above", "other dwelling above"]
ROOF_TYPES = [
"pitched",
"roof room",
"loft",
"flat",
"thatched",
"at rafters",
"assumed",
]
DWELLING_ABOVE = [
"another dwelling above",
"other premises above",
"other dwelling above",
"(same dwelling above)",
]
WELSH_TEXT = {
"ar oleddf, dim inswleiddio": "pitched, no insulation",
@ -18,10 +34,10 @@ class RoofAttributes(Definitions):
"ar oleddf, wedi?i inswleiddio": "pitched, insulated",
"ar oleddf, inswleiddio cyfyngedig (rhagdybiaeth)": "pitched, limited insulation (assumed)",
"ar oleddf, inswleiddio cyfyngedig": "pitched, limited insulation",
"ar oleddf, wedigçöi inswleiddio wrth y trawstiau": 'pitched, insulated at rafters',
"ar oleddf, wedi?i inswleiddio wrth y trawstiau": 'pitched, insulated at rafters',
"ar oleddf, wedi?i inswleiddio wrth y trawstia": 'pitched, insulated at rafters',
"ar oleddf, wedigçöi inswleiddio wrth y trawstia": 'pitched, insulated at rafters',
"ar oleddf, wedigçöi inswleiddio wrth y trawstiau": "pitched, insulated at rafters",
"ar oleddf, wedi?i inswleiddio wrth y trawstiau": "pitched, insulated at rafters",
"ar oleddf, wedi?i inswleiddio wrth y trawstia": "pitched, insulated at rafters",
"ar oleddf, wedigçöi inswleiddio wrth y trawstia": "pitched, insulated at rafters",
"yn wastad, inswleiddio cyfyngedig (rhagdybiaeth)": "flat, limited insulation (assumed)",
"yn wastad, inswleiddio cyfyngedig": "flat, limited insulation",
"yn wastad, dim inswleiddio (rhagdybiaeth)": "flat, no insulation (assumed)",
@ -43,9 +59,18 @@ class RoofAttributes(Definitions):
}
DEFAULT_KEYS = [
'thermal_transmittance', 'thermal_transmittance_unit', 'is_pitched', 'is_roof_room',
'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed', 'has_dwelling_above',
'is_valid', 'insulation_thickness'
"thermal_transmittance",
"thermal_transmittance_unit",
"is_pitched",
"is_roof_room",
"is_loft",
"is_flat",
"is_thatched",
"is_at_rafters",
"is_assumed",
"has_dwelling_above",
"is_valid",
"insulation_thickness",
]
def __init__(self, description: str):
@ -54,14 +79,21 @@ class RoofAttributes(Definitions):
"""
self.description: str = description.lower().strip()
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or self.description == "sap05:roof"
self.nodata = (
not description
or description in self.DATA_ANOMALY_MATCHES
or self.description == "sap05:roof"
)
self.welsh_translation_search()
if not self.nodata and not any(
rt in self.description for rt in self.ROOF_TYPES + self.DWELLING_ABOVE + ["average thermal transmittance"]
rt in self.description
for rt in self.ROOF_TYPES
+ self.DWELLING_ABOVE
+ ["average thermal transmittance"]
):
raise ValueError('Invalid description')
raise ValueError("Invalid description")
def welsh_translation_search(self):
"""
@ -76,7 +108,7 @@ class RoofAttributes(Definitions):
r"ar oleddf, (\d+ mm) lo inswleiddio yn y llof",
r"ar oleddf, (\d+\+ mm) lo inswleiddio yn y llof",
r"ar oleddf, (\d+mm) o inswleiddio yn y llofft",
r"ar oleddf, (\d+\+ mm) o inswleiddio yn y llofft"
r"ar oleddf, (\d+\+ mm) o inswleiddio yn y llofft",
]
li_thickness_match = None
for regex in loft_insulation_regexes:
@ -84,9 +116,14 @@ class RoofAttributes(Definitions):
if li_thickness_match:
break
uvalue_search = re.search(r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m-¦k", self.description)
uvalue_search = re.search(
r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m-¦k",
self.description,
)
uvalue_search2 = re.search(
r'trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m.+k', self.description, re.IGNORECASE
r"trawsyriannedd thermol cyfartalog (\d+(\.\d+)?)\s*w/m.+k",
self.description,
re.IGNORECASE,
)
# Step 2: Generalized translation with placeholder
@ -121,9 +158,13 @@ class RoofAttributes(Definitions):
result, description = extract_thermal_transmittance(result, description)
# roof type
result, description = extract_component_types(result, description, list_of_components=self.ROOF_TYPES)
result, description = extract_component_types(
result, description, list_of_components=self.ROOF_TYPES
)
result["has_dwelling_above"] = any([x in description for x in self.DWELLING_ABOVE])
result["has_dwelling_above"] = any(
[x in description for x in self.DWELLING_ABOVE]
)
for dwelling_above in self.DWELLING_ABOVE:
description = description.replace(dwelling_above, "")
@ -136,7 +177,7 @@ class RoofAttributes(Definitions):
# Search for a regular expression that matches 150 insulation
match = re.search(r"(\d+\+?)\s*insulation", description)
if match:
result['insulation_thickness'] = match.group(1)
result["insulation_thickness"] = match.group(1)
# insulation thickness
thickness_map = {
@ -149,21 +190,21 @@ class RoofAttributes(Definitions):
}
for key, value in thickness_map.items():
if key in description:
result['insulation_thickness'] = value
result["insulation_thickness"] = value
# Remove the match from the description
# description = description.replace(key, "")
break
# Extract insulation thickness in mm, if present
match = re.search(r'(\d+\+?)\s*mm', description)
match = re.search(r"(\d+\+?)\s*mm", description)
if match:
result['insulation_thickness'] = match.group(1)
result["insulation_thickness"] = match.group(1)
if "insulation_thickness" not in result:
result['insulation_thickness'] = None
result["insulation_thickness"] = None
if result["has_dwelling_above"]:
result["thermal_transmittance"] = 0
result["thermal_transmittance_unit"] = 'w/m-¦k'
result["thermal_transmittance_unit"] = "w/m-¦k"
return result

View file

@ -3,76 +3,78 @@ from typing import Dict, Union
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import (
extract_component_types,
extract_thermal_transmittance
extract_thermal_transmittance,
)
class WallAttributes(Definitions):
WALL_TYPES = ['cavity wall', 'filled cavity', 'solid brick', 'system built', 'timber frame', 'granite or whinstone',
'as built', 'cob', 'assumed', 'sandstone or limestone', "park home"]
WALL_TYPES = [
"cavity wall",
"filled cavity",
"solid brick",
"system built",
"timber frame",
"granite or whinstone",
"as built",
"cob",
"assumed",
"sandstone or limestone",
"park home",
]
WELSH_TEXT = {
"Briciau solet, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)":
"Solid brick, as built, no insulation (assumed)",
'Waliau ceudod, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)':
'Cavity wall, as built, partial insulation (assumed)',
'Waliau ceudod, fel yGÇÖu hadeiladwyd, inswleiddio rhannol':
'Cavity wall, as built, partial insulation',
'Waliau ceudod, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)':
'Cavity wall, as built, no insulation (assumed)',
'Waliau ceudod, fel yGÇÖu hadeiladwyd, dim inswleiddio':
'Cavity wall, as built, no insulation',
'Tywodfaen, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)':
'Sandstone or limestone, as built, no insulation (assumed)',
'Tywodfaen, fel yGÇÖu hadeiladwyd, dim inswleiddio':
'Sandstone or limestone, as built, no insulation',
'Waliau ceudod, ceudod wediGÇÖi lenwi': 'Cavity wall, filled cavity',
'Waliau ceudod, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)':
'Cavity wall, as built, insulated (assumed)',
'Waliau ceudod, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio':
'Cavity wall, as built, insulated',
'Gwenithfaen neu risgraig, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)':
'Granite or whinstone, as built, no insulation (assumed)',
'Waliau ceudod,': 'Cavity wall, as built, no insulation',
'Ffr+óm bren, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)':
'Timber frame, as built, insulated (assumed)',
'Ffr+óm bren, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio':
'Timber frame, as built, insulated',
'Gwenithfaen neu risgraig, gydag inswleiddio allanol': 'Granite or whinstone, with external insulation',
'WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)':
'System built, as built, no insulation (assumed)',
'Tywodfaen, gydag inswleiddio mewnol': 'Sandstone or limestone, with internal insulation',
'Waliau ceudod, ynysydd allanol a llenwi ceudod': 'Cavity wall, filled cavity and external insulation',
'Gwenithfaen neu risgraig, gydag inswleiddio mewnol': 'Granite or whinstone, with internal insulation',
'Ffr+óm bren, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)':
'Timber frame, as built, partial insulation (assumed)',
'WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)':
'System built, as built, insulated (assumed)',
'WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio':
'System built, as built, insulated',
'WediGÇÖu hadeiladu yn +¦l system, gydag inswleiddio allanol': 'System built, with external insulation',
'Briciau solet, gydag inswleiddio mewnol': 'Solid brick, with internal insulation',
'WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)':
'System built, as built, partial insulation (assumed)',
'WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, inswleiddio rhannol':
'System built, as built, partial insulation',
'Ffr+óm bren, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)':
'Timber frame, as built, no insulation (assumed)',
'Ffr+óm bren, fel yGÇÖu hadeiladwyd, dim inswleiddio':
'Timber frame, as built, no insulation',
'Tywodfaen, gydag inswleiddio allanol': 'Sandstone or limestone, with external insulation',
'Waliau ceudod, gydag inswleiddio allanol': 'Cavity wall, with external insulation',
'Briciau solet, gydag inswleiddio allanol': 'Solid brick, with external insulation',
"Briciau solet, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "Solid brick, as built, no insulation (assumed)",
"Waliau ceudod, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)": "Cavity wall, as built, partial insulation (assumed)",
"Waliau ceudod, fel yGÇÖu hadeiladwyd, inswleiddio rhannol": "Cavity wall, as built, partial insulation",
"Waliau ceudod, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "Cavity wall, as built, no insulation (assumed)",
"Waliau ceudod, fel yGÇÖu hadeiladwyd, dim inswleiddio": "Cavity wall, as built, no insulation",
"Tywodfaen, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "Sandstone or limestone, as built, no insulation (assumed)",
"Tywodfaen, fel yGÇÖu hadeiladwyd, dim inswleiddio": "Sandstone or limestone, as built, no insulation",
"Waliau ceudod, ceudod wediGÇÖi lenwi": "Cavity wall, filled cavity",
"Waliau ceudod, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)": "Cavity wall, as built, insulated (assumed)",
"Waliau ceudod, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio": "Cavity wall, as built, insulated",
"Gwenithfaen neu risgraig, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "Granite or whinstone, as built, no insulation (assumed)",
"Waliau ceudod,": "Cavity wall, as built, no insulation",
"Ffr+óm bren, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)": "Timber frame, as built, insulated (assumed)",
"Ffr+óm bren, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio": "Timber frame, as built, insulated",
"Gwenithfaen neu risgraig, gydag inswleiddio allanol": "Granite or whinstone, with external insulation",
"WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "System built, as built, no insulation (assumed)",
"Tywodfaen, gydag inswleiddio mewnol": "Sandstone or limestone, with internal insulation",
"Waliau ceudod, ynysydd allanol a llenwi ceudod": "Cavity wall, filled cavity and external insulation",
"Gwenithfaen neu risgraig, gydag inswleiddio mewnol": "Granite or whinstone, with internal insulation",
"Ffr+óm bren, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)": "Timber frame, as built, partial insulation (assumed)",
"WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)": "System built, as built, insulated (assumed)",
"WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio": "System built, as built, insulated",
"WediGÇÖu hadeiladu yn +¦l system, gydag inswleiddio allanol": "System built, with external insulation",
"Briciau solet, gydag inswleiddio mewnol": "Solid brick, with internal insulation",
"WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, inswleiddio rhannol (rhagdybiaeth)": "System built, as built, partial insulation (assumed)",
"WediGÇÖu hadeiladu yn +¦l system, fel yGÇÖu hadeiladwyd, inswleiddio rhannol": "System built, as built, partial insulation",
"Ffr+óm bren, fel yGÇÖu hadeiladwyd, dim inswleiddio (rhagdybiaeth)": "Timber frame, as built, no insulation (assumed)",
"Ffr+óm bren, fel yGÇÖu hadeiladwyd, dim inswleiddio": "Timber frame, as built, no insulation",
"Tywodfaen, gydag inswleiddio allanol": "Sandstone or limestone, with external insulation",
"Waliau ceudod, gydag inswleiddio allanol": "Cavity wall, with external insulation",
"Briciau solet, gydag inswleiddio allanol": "Solid brick, with external insulation",
# Add in some corrections:
'Co with external insulation': 'Cob, with external insulation',
'Cowith external insulation': 'Cob, with external insulation',
"Co with external insulation": "Cob, with external insulation",
"Cowith external insulation": "Cob, with external insulation",
}
DEFAULT_KEYS = [
'thermal_transmittance', 'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity',
'is_solid_brick', 'is_system_built', 'is_timber_frame', 'is_granite_or_whinstone',
'is_as_built', 'is_cob', 'is_assumed', 'is_sandstone_or_limestone',
'insulation_thickness', 'external_insulation', 'internal_insulation'
"thermal_transmittance",
"thermal_transmittance_unit",
"is_cavity_wall",
"is_filled_cavity",
"is_solid_brick",
"is_system_built",
"is_timber_frame",
"is_granite_or_whinstone",
"is_as_built",
"is_cob",
"is_assumed",
"is_sandstone_or_limestone",
"insulation_thickness",
"external_insulation",
"internal_insulation",
]
CORRECTIONS = {
@ -98,7 +100,9 @@ class WallAttributes(Definitions):
:return:
"""
uvalue_search = re.search(r"Trawsyriannedd thermol cyfartalog (\d+\.?\d*)", self.description)
uvalue_search = re.search(
r"Trawsyriannedd thermol cyfartalog (\d+\.?\d*)", self.description
)
if uvalue_search:
uvalue = uvalue_search.group(1)
@ -123,7 +127,9 @@ class WallAttributes(Definitions):
result, description = extract_thermal_transmittance(result, description)
# wall type
result, description = extract_component_types(result, description, list_of_components=self.WALL_TYPES)
result, description = extract_component_types(
result, description, list_of_components=self.WALL_TYPES
)
# Handle some edge cases
if "sandstone" in description and not result["is_sandstone_or_limestone"]:
@ -137,18 +143,18 @@ class WallAttributes(Definitions):
"partial insulation": "below average",
"no insulation": "none",
"additional insulation": "above average",
"insulated": "average"
"insulated": "average",
}
for key, value in thickness_map.items():
if key in description:
result['insulation_thickness'] = value
result["insulation_thickness"] = value
break
else:
result['insulation_thickness'] = None
result["insulation_thickness"] = None
# insulation type
result['external_insulation'] = 'external insulation' in description
result['internal_insulation'] = 'internal insulation' in description
result["external_insulation"] = "external insulation" in description
result["internal_insulation"] = "internal insulation" in description
if result["is_filled_cavity"]:
# If it has a filled cavity + internal/external insulation, it's deemed to have above average insulation
@ -159,7 +165,11 @@ class WallAttributes(Definitions):
else:
result["insulation_thickness"] = "average"
if result["is_cavity_wall"] & result["is_as_built"] & (result["insulation_thickness"] == "average"):
if (
result["is_cavity_wall"]
& result["is_as_built"]
& (result["insulation_thickness"] == "average")
):
result["is_filled_cavity"] = True
return result

File diff suppressed because it is too large Load diff

View file

@ -205,7 +205,7 @@ def get_wall_u_value(
mapped_value = wall_uvalues_df[
wall_uvalues_df["Wall_type"] == mapped_description
][age_band].values[0]
][age_band].values[0]
if pd.isnull(mapped_value) and "Park home" in mapped_description:
# We don't know enough in this case so we default to 0
@ -563,7 +563,7 @@ def get_floor_u_value(
insulation_lookup = s11[
s11["Age_band"].str.contains(age_band) & s11["Floor_construction"]
== floor_type
]
]
if insulation_lookup.empty:
insulation_thickness = 0
else: