Merge pull request #245 from Hestia-Homes/sap-model-updates

Sap model updates
This commit is contained in:
KhalimCK 2023-10-18 10:28:48 +11:00 committed by GitHub
commit f326de6ea0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 179 additions and 35 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -569,3 +569,31 @@ class DataProcessor:
df[col] = df[col].fillna("Unknown")
return df
@staticmethod
def clean_efficiency_variables(df):
    """
    Replace missing energy efficiency ratings with the placeholder "NO_RATING".

    Every column that contains at least one missing value must have "ENERGY_EFF" in its
    name; if any other column has missing values a ValueError is raised instead of filling.
    The columns are re-assigned on the dataframe that was passed in, which is also returned.

    There is scope to clean this more carefully per corresponding description, e.g. for
    WALLS_ENERGY_EFF we could take the mode efficiency rating by description and use that to
    fill the missing values. When this was first looked at there was a large volume of
    records with missing energy efficiency values, so the simpler approach was taken just
    to test including these variables.

    :param df: dataframe whose only missing values are expected to be in efficiency columns
    :return: the same dataframe, with missing efficiency values set to "NO_RATING"
    :raises ValueError: if a column without "ENERGY_EFF" in its name has missing values
    """
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0].index

    # Nothing to clean
    if columns_with_nulls.empty:
        return df

    # Only efficiency columns are allowed to contain missing values at this point
    is_efficiency_column = columns_with_nulls.str.contains("ENERGY_EFF")
    if (~is_efficiency_column).any():
        raise ValueError("Non efficiency columns are missing")

    for column in columns_with_nulls:
        df[column] = df[column].fillna("NO_RATING")

    return df

View file

@ -12,6 +12,10 @@ from etl.epc.settings import (
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
CARBON_RESPONSE,
CORE_COMPONENT_FEATURES,
EFFICIENCY_FEATURES,
POTENTIAL_COLUMNS,
MINIMUM_FLOOR_HEIGHT
)
from etl.epc.DataProcessor import DataProcessor
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
@ -254,6 +258,13 @@ def make_uvalues(df):
# Roof
# ~~~~~~~~~~~~~~~~~~
if x["has_dwelling_above"]:
if x["roof_thermal_transmittance"] != 0:
raise ValueError("Should have 0 u-value for roof")
if x["roof_thermal_transmittance_ENDING"] != 0:
raise ValueError("Should have 0 u-value for roof")
starting_roof_uvalue = x["roof_thermal_transmittance"]
if pd.isnull(starting_roof_uvalue):
starting_roof_uvalue = get_roof_u_value(
@ -297,6 +308,11 @@ def make_uvalues(df):
wall_type = get_wall_type(**x)
if x["another_property_below"]:
if x["floor_thermal_transmittance"] != 0:
raise ValueError("Should have 0 u-value for floor")
if x["floor_thermal_transmittance_ENDING"] != 0:
raise ValueError("Should have 0 u-value for floor")
starting_floor_uvalue, ending_floor_uvalue = 0, 0
else:
starting_floor_uvalue = x["floor_thermal_transmittance"]
@ -363,6 +379,25 @@ def make_uvalues(df):
return df
def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list):
    """
    For a list of columns, check if the earliest and latest record are the same.
    If they are the same, we indicate this, because we have examples of SAP scores changing
    without any feature changes.

    A value that is missing (NaN/None) in both records is treated as unchanged. A plain
    ``!=`` comparison would report it as a difference because NaN never equals itself.

    :param earliest_record: pd.Series
    :param latest_record: pd.Series
    :param columns: list of columns to compare
    :return: boolean indicating whether or not all features are the same
    """
    for col in columns:
        earliest_value = earliest_record[col]
        latest_value = latest_record[col]

        earliest_missing = pd.isnull(earliest_value)
        latest_missing = pd.isnull(latest_value)
        if earliest_missing or latest_missing:
            if earliest_missing and latest_missing:
                # Missing on both sides - nothing has changed for this column
                continue
            # Missing on one side only counts as a change
            return False

        if earliest_value != latest_value:
            return False

    return True
def app():
# Get all the files in the directory
@ -376,6 +411,8 @@ def app():
dataset = []
cleaning_dataset = []
# Keep track of the all equals
all_equal_rows = []
for directory in tqdm(directories):
@ -422,7 +459,9 @@ def app():
# We include the lodgement date here as we probably need to factor time into the
# model, since EPC standards and rigour have changed over time
variable_data = property_data[
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
COMPONENT_FEATURES + EFFICIENCY_FEATURES + POTENTIAL_COLUMNS + [
"LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE
]
]
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
@ -439,6 +478,8 @@ def app():
# Check if the sap gets better or worse
gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
if gets_better:
starting_sap = earliest_record[RDSAP_RESPONSE]
starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
@ -452,8 +493,8 @@ def app():
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
else:
starting_sap = latest_record[RDSAP_RESPONSE]
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
@ -467,12 +508,23 @@ def app():
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
if rdsap_change == 0:
continue
all_equal = compare_records(
earliest_record=earliest_record,
latest_record=latest_record,
columns=CORE_COMPONENT_FEATURES
)
if all_equal:
# Keep track of this for the moment so we can analyse
all_equal_rows.append({"uprn": uprn, "directory_name": directory.name})
continue
features = pd.concat([starting_record, ending_record])
property_model_data.append(
@ -487,6 +539,10 @@ def app():
"HEAT_DEMAND_ENDING": ending_heat_demand,
"CARBON_STARTING": starting_carbon,
"CARBON_ENDING": ending_carbon,
"POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"],
"ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"],
"ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"],
"CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"],
**fixed_data,
**features.to_dict(),
}
@ -496,8 +552,6 @@ def app():
data_by_urpn_df = pd.DataFrame(data_by_urpn)
# Add some temporal features - we look at the days from the standard starting point in time
# for the starting and ending date so all records are from a fixed point
data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
data_by_urpn_df["LODGEMENT_DATE_STARTING"]
)
@ -508,6 +562,8 @@ def app():
data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)
# We look for key building fabric features that have changed from one EPC to the next.
# if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
# remove this record, as it indicates that the quality of the EPC conducted in the first instance
@ -541,6 +597,8 @@ def app():
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
cleaning_dataset.append(cleaning_averages)
print("Final all equal count: %s" % str(len(all_equal_rows)))
# Store cleaning dataset in s3 as a parquet file
cleaning_dataset = pd.concat(cleaning_dataset)
save_dataframe_to_s3_parquet(
@ -567,6 +625,14 @@ def app():
file_key="sap_change_model/dataset.parquet",
)
# Store all_equal_rows
all_equal_rows = pd.DataFrame(all_equal_rows)
save_dataframe_to_s3_parquet(
df=all_equal_rows,
bucket_name="retrofit-data-dev",
file_key="sap_change_model/all_equal_rows.parquet",
)
if __name__ == "__main__":
app()

View file

@ -85,8 +85,7 @@ FIXED_FEATURES = [
"FIXED_LIGHTING_OUTLETS_COUNT",
]
COMPONENT_FEATURES = [
"TRANSACTION_TYPE",
CORE_COMPONENT_FEATURES = [
"WALLS_DESCRIPTION",
"FLOOR_DESCRIPTION",
"LIGHTING_DESCRIPTION",
@ -96,21 +95,49 @@ COMPONENT_FEATURES = [
"MAIN_FUEL",
"MECHANICAL_VENTILATION",
"SECONDHEAT_DESCRIPTION",
"ENERGY_TARIFF", # Not sure if this is relevant
"SOLAR_WATER_HEATING_FLAG",
"PHOTO_SUPPLY",
"WINDOWS_DESCRIPTION",
"GLAZED_TYPE",
"MULTI_GLAZE_PROPORTION",
"LOW_ENERGY_LIGHTING",
"NUMBER_OPEN_FIREPLACES",
"MAINHEATCONT_DESCRIPTION",
"SOLAR_WATER_HEATING_FLAG",
"PHOTO_SUPPLY",
]
# Per-component energy efficiency rating columns
EFFICIENCY_FEATURES = [
    "HOT_WATER_ENERGY_EFF",
    "FLOOR_ENERGY_EFF",
    "WINDOWS_ENERGY_EFF",
    "WALLS_ENERGY_EFF",
    "SHEATING_ENERGY_EFF",
    "ROOF_ENERGY_EFF",
    "MAINHEAT_ENERGY_EFF",
    "MAINHEATC_ENERGY_EFF",
    "LIGHTING_ENERGY_EFF",
]
# Full component feature set: the core component features plus the additional columns below
COMPONENT_FEATURES = [
    *CORE_COMPONENT_FEATURES,
    "TRANSACTION_TYPE",
    "ENERGY_TARIFF",  # Not sure if this is relevant
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    # 'GLAZED_AREA',  # May not need this since we have MULTI_GLAZE_PROPORTION
]
# "Potential" columns carried alongside the component and efficiency features
POTENTIAL_COLUMNS = [
    "POTENTIAL_ENERGY_RATING",
    "POTENTIAL_ENERGY_EFFICIENCY",
    "ENVIRONMENT_IMPACT_POTENTIAL",
    "ENERGY_CONSUMPTION_POTENTIAL",
    "CO2_EMISSIONS_POTENTIAL",
    # We don't include cost features for the moment
    # 'LIGHTING_COST_POTENTIAL',
    # 'HEATING_COST_POTENTIAL',
    # 'HOT_WATER_COST_POTENTIAL'
]
# For these fields, we take the latest value if we have multiple values
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
# the most accurate
@ -253,3 +280,7 @@ ENDING_SUFFIX_COMPONENT_COLS = [
'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter'
]
# We found that without performing any filtering, the bottom 0.5% of homes had a floor height of 1.65m. We'll therefore
# filter out any homes with a floor height below this
MINIMUM_FLOOR_HEIGHT = 1.65

View file

@ -107,4 +107,8 @@ class FloorAttributes(Definitions):
else:
result['insulation_thickness'] = None
if result["another_property_below"]:
result["thermal_transmittance"] = 0
result["thermal_transmittance_unit"] = 'w/m-¦k'
return result

View file

@ -138,4 +138,8 @@ class RoofAttributes(Definitions):
if "insulation_thickness" not in result:
result['insulation_thickness'] = None
if result["has_dwelling_above"]:
result["thermal_transmittance"] = 0
result["thermal_transmittance_unit"] = 'w/m-¦k'
return result

View file

@ -133,4 +133,13 @@ class WallAttributes(Definitions):
result['external_insulation'] = 'external insulation' in description
result['internal_insulation'] = 'internal insulation' in description
if result["is_filled_cavity"]:
# If it has a filled cavity + internal/external insulation, it's deemed to have above average insulation
if result["external_insulation"]:
result["insulation_thickness"] = "above average"
elif result["internal_insulation"]:
result["insulation_thickness"] = "above average"
else:
result["insulation_thickness"] = "average"
return result

View file

@ -1,14 +1,14 @@
clean_floor_cases = [
{'original_description': '(another dwelling below)', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_assumed': False, 'is_to_unheated_space': False,
{'original_description': '(another dwelling below)', 'thermal_transmittance': 0,
'thermal_transmittance_unit': "w/m-¦k", 'is_assumed': False, 'is_to_unheated_space': False,
'is_to_external_air': False, 'is_suspended': False, 'is_solid': False, 'insulation_thickness': None,
"another_property_below": True},
{'original_description': '(anheddiad arall islaw)', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_assumed': False, 'is_to_unheated_space': False,
{'original_description': '(anheddiad arall islaw)', 'thermal_transmittance': 0,
'thermal_transmittance_unit': "w/m-¦k", 'is_assumed': False, 'is_to_unheated_space': False,
'is_to_external_air': False, 'is_suspended': False, 'is_solid': False, 'insulation_thickness': None,
"another_property_below": True},
{'original_description': '(other premises below)', 'thermal_transmittance': None,
'thermal_transmittance_unit': None,
{'original_description': '(other premises below)', 'thermal_transmittance': 0,
'thermal_transmittance_unit': "w/m-¦k",
'is_assumed': False, 'is_to_unheated_space': False, 'is_to_external_air': False, 'is_suspended': False,
'is_solid': False, 'insulation_thickness': None,
"another_property_below": True},
@ -342,8 +342,8 @@ clean_floor_cases = [
{'original_description': 'To unheated space, no insulation (assumed)', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_assumed': True, 'is_to_unheated_space': True, 'is_to_external_air': False,
'is_suspended': False, 'is_solid': False, 'insulation_thickness': 'none', "another_property_below": False},
{'original_description': '(eiddo arall islaw)', 'thermal_transmittance': None,
'thermal_transmittance_unit': None,
{'original_description': '(eiddo arall islaw)', 'thermal_transmittance': 0,
'thermal_transmittance_unit': "w/m-¦k",
'is_assumed': False, 'is_to_unheated_space': False, 'is_to_external_air': False, 'is_suspended': False,
'is_solid': False, 'insulation_thickness': None,
"another_property_below": True},

View file

@ -1,10 +1,11 @@
clean_roof_test_cases = [
{'original_description': '(another dwelling above)', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_pitched': False, 'is_roof_room': False, 'is_loft': False, 'is_flat': False,
{'original_description': '(another dwelling above)', 'thermal_transmittance': 0,
'thermal_transmittance_unit': "w/m-¦k", 'is_pitched': False, 'is_roof_room': False, 'is_loft': False,
'is_flat': False,
'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False, 'has_dwelling_above': True, 'is_valid': True,
'insulation_thickness': None},
{'original_description': '(other premises above)', 'thermal_transmittance': None,
'thermal_transmittance_unit': None,
{'original_description': '(other premises above)', 'thermal_transmittance': 0,
'thermal_transmittance_unit': "w/m-¦k",
'is_pitched': False, 'is_roof_room': False, 'is_loft': False, 'is_flat': False, 'is_thatched': False,
'is_at_rafters': False, 'is_assumed': False, 'has_dwelling_above': True, 'is_valid': True,
'insulation_thickness': None},
@ -362,8 +363,9 @@ clean_roof_test_cases = [
'thermal_transmittance_unit': None, 'is_pitched': True, 'is_roof_room': False, 'is_loft': False, 'is_flat': False,
'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True, 'has_dwelling_above': False, 'is_valid': True,
'insulation_thickness': 'average'},
{'original_description': '(eiddo arall uwchben)', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_pitched': False, 'is_roof_room': False, 'is_loft': False, 'is_flat': False,
{'original_description': '(eiddo arall uwchben)', 'thermal_transmittance': 0,
'thermal_transmittance_unit': "w/m-¦k", 'is_pitched': False, 'is_roof_room': False, 'is_loft': False,
'is_flat': False,
'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False, 'has_dwelling_above': True, 'is_valid': True,
'insulation_thickness': None},
{'original_description': 'Ar oleddf, inswleiddio cyfyngedig (rhagdybiaeth)', 'thermal_transmittance': None,

View file

@ -567,17 +567,17 @@ wall_cases = [
{'original_description': 'Cavity wall, filled cavity', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_cavity_wall': True, 'is_filled_cavity': True, 'is_solid_brick': False,
'is_system_built': False, 'is_timber_frame': False, 'is_granite_or_whinstone': False, 'is_as_built': False,
'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': None,
'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': "average",
'external_insulation': False, 'internal_insulation': False},
{'original_description': 'Cavity wall, filled cavity and external insulation', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_cavity_wall': True, 'is_filled_cavity': True, 'is_solid_brick': False,
'is_system_built': False, 'is_timber_frame': False, 'is_granite_or_whinstone': False, 'is_as_built': False,
'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': 'average',
'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': 'above average',
'external_insulation': True, 'internal_insulation': False},
{'original_description': 'Cavity wall, filled cavity and internal insulation', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_cavity_wall': True, 'is_filled_cavity': True, 'is_solid_brick': False,
'is_system_built': False, 'is_timber_frame': False, 'is_granite_or_whinstone': False, 'is_as_built': False,
'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': 'average',
'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': 'above average',
'external_insulation': False, 'internal_insulation': True},
{'original_description': 'Cavity wall, with external insulation', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_cavity_wall': True, 'is_filled_cavity': False, 'is_solid_brick': False,
@ -723,7 +723,7 @@ wall_cases = [
{'original_description': 'Waliau ceudod, ceudod wediGÇÖi lenwi', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_cavity_wall': True, 'is_filled_cavity': True, 'is_solid_brick': False,
'is_system_built': False, 'is_timber_frame': False, 'is_granite_or_whinstone': False, 'is_as_built': False,
'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': None,
'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': "average",
'external_insulation': False, 'internal_insulation': False},
{'original_description': 'Waliau ceudod, fel yGÇÖu hadeiladwyd, wediGÇÖu hinswleiddio (rhagdybiaeth)',
'thermal_transmittance': None,
@ -778,7 +778,7 @@ wall_cases = [
{'original_description': 'Waliau ceudod, ynysydd allanol a llenwi ceudod', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_cavity_wall': True, 'is_filled_cavity': True, 'is_solid_brick': False,
'is_system_built': False, 'is_timber_frame': False, 'is_granite_or_whinstone': False, 'is_as_built': False,
'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': 'average',
'is_cob': False, 'is_assumed': False, 'is_sandstone_or_limestone': False, 'insulation_thickness': 'above average',
'external_insulation': True, 'internal_insulation': False},
{'original_description': 'Gwenithfaen neu risgraig, gydag inswleiddio mewnol', 'thermal_transmittance': None,
'thermal_transmittance_unit': None, 'is_cavity_wall': False, 'is_filled_cavity': False, 'is_solid_brick': False,

View file

@ -75,8 +75,8 @@ class TestRoofAttributes:
"is_assumed": False,
"is_flat": False,
"is_thatched": False,
"thermal_transmittance": None,
"thermal_transmittance_unit": None,
"thermal_transmittance": 0,
"thermal_transmittance_unit": "w/m-¦k",
}
for k in expected_output: