diff --git a/.idea/Model.iml b/.idea/Model.iml
index 05b9012b..b03b31b1 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 3b05c6ac..ca0e1cd9 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
-
+
diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py
index 8a03b553..c094c085 100644
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@@ -134,6 +134,7 @@ EARLIEST_EPC_DATE = "2014-08-01"
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
+CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
def ordinal(n):
diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py
index 42317edd..b3961ce1 100644
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@@ -1,5 +1,6 @@
import pandas as pd
from tqdm import tqdm
+import msgpack
from pathlib import Path
from model_data.simulation_system.core.Settings import (
@@ -9,20 +10,42 @@ from model_data.simulation_system.core.Settings import (
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
- EARLIEST_EPC_DATE
+ EARLIEST_EPC_DATE,
+ CARBON_RESPONSE,
)
from model_data.simulation_system.core.DataProcessor import DataProcessor
-from utils.s3 import save_dataframe_to_s3_parquet
+from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
+def get_cleaned():
+ """
+ Retrieve the cleaned-description lookup for the EPC dataset from S3
+ (object "cleaned_epc_data/cleaned.bson" in bucket "retrofit-data-dev").
+
+ The payload is decoded with msgpack.unpackb, despite the ".bson" file name.
+ :return: the decoded object; app() indexes it by "<component>-description"
+ """
+
+ cleaned = read_from_s3(
+ s3_file_name="cleaned_epc_data/cleaned.bson",
+ bucket_name="retrofit-data-dev"
+ )
+
+ cleaned = msgpack.unpackb(cleaned, raw=False)
+
+ return cleaned
+
+
def app():
# Get all the files in the directory
# Data glossary:
# https://epc.opendatacommunities.org/docs/guidance#glossary
+ cleaned_lookup = get_cleaned()
+
# List all subdirectories
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
@@ -84,7 +107,7 @@ def app():
# We include the lodgement date here as we probably need to factor time into the
# model, since EPC standards and rigour have changed over time
variable_data = modified_property_data[
- COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
+ COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
]
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
@@ -104,24 +127,29 @@ def app():
if gets_better:
starting_sap = earliest_record[RDSAP_RESPONSE]
starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
+ starting_carbon = earliest_record[CARBON_RESPONSE]
+
rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
- else:
- starting_sap = latest_record[RDSAP_RESPONSE]
- starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
- rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
- heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
+ carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
- if rdsap_change == 0:
- continue
-
- if gets_better:
starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
else:
+ starting_sap = latest_record[RDSAP_RESPONSE]
+ starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
+ starting_carbon = latest_record[CARBON_RESPONSE]
+
+ rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
+ heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
+ carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
+
starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
+ if rdsap_change == 0:
+ continue
+
features = pd.concat([starting_record, ending_record])
property_model_data.append(
@@ -129,8 +157,10 @@ def app():
"UPRN": uprn,
"RDSAP_CHANGE": rdsap_change,
"HEAT_DEMAND_CHANGE": heat_demand_change,
- "STARTING_SAP": starting_sap,
- "STARTING_HEAT_DEMAND": starting_heat_demand,
+ "CARBON_CHANGE": carbon_change,
+ "SAP_STARTING": starting_sap,
+ "HEAT_DEMAND_STARTING": starting_heat_demand,
+ "CARBON_STARTING": starting_carbon,
**fixed_data,
**features.to_dict(),
}
@@ -152,6 +182,125 @@ def app():
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
# within descriptions
+ # We look for key building fabric features that have changed from one EPC to the next.
+ # If, for example, we see that a home has gone from being a cavity wall to a solid wall, we
+ # remove this record, as it indicates that the quality of the EPC conducted in the first instance
+ # is low.
+ # We also replace descriptions with their cleaned variants.
+
+ def process_and_prune_desriptions(df, cleaned_lookup):
+ """Swap each walls/floor/roof *_DESCRIPTION_* column for its cleaned text and keep only rows whose construction flags match between the STARTING and ENDING EPC; returns the filtered df."""
+ # TODO: In a future iteration, we can test using the binary features and the insulation thickness
+ # estimates, as well as estimated U-values
+
+ cols_to_drop = {
+ "walls": [
+ 'original_description', 'thermal_transmittance',
+ 'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity',
+ 'is_solid_brick', 'is_system_built', 'is_timber_frame',
+ 'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed',
+ 'is_sandstone_or_limestone', 'insulation_thickness',
+ 'external_insulation', 'internal_insulation',
+ 'original_description_ENDING',
+ 'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
+ 'is_cavity_wall_ENDING', 'is_filled_cavity_ENDING',
+ 'is_solid_brick_ENDING', 'is_system_built_ENDING',
+ 'is_timber_frame_ENDING', 'is_granite_or_whinstone_ENDING',
+ 'is_as_built_ENDING', 'is_cob_ENDING', 'is_assumed_ENDING',
+ 'is_sandstone_or_limestone_ENDING', 'insulation_thickness_ENDING',
+ 'external_insulation_ENDING', 'internal_insulation_ENDING',
+ ],
+ "floor": [
+ 'original_description', 'thermal_transmittance',
+ 'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space',
+ 'is_to_external_air', 'is_suspended', 'is_solid',
+ 'another_property_below', 'insulation_thickness', 'no_data',
+ 'original_description_ENDING',
+ 'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
+ 'is_assumed_ENDING', 'is_to_unheated_space_ENDING',
+ 'is_to_external_air_ENDING', 'is_suspended_ENDING', 'is_solid_ENDING',
+ 'another_property_below_ENDING', 'insulation_thickness_ENDING',
+ 'no_data_ENDING',
+ ],
+ "roof": [
+ 'original_description', 'thermal_transmittance',
+ 'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
+ 'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
+ 'has_dwelling_above', 'is_valid', 'insulation_thickness',
+ 'original_description_ENDING',
+ 'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
+ 'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING',
+ 'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING',
+ 'is_assumed_ENDING', 'has_dwelling_above_ENDING', 'is_valid_ENDING',
+ 'insulation_thickness_ENDING',
+ ]
+ # NOTE: never list clean_description / clean_description_ENDING above - they are renamed into the *_DESCRIPTION_* columns below
+ }
+
+ for component in ["walls", "floor", "roof"]:
+ component_upper = component.upper()
+
+ df = df.merge(
+ pd.DataFrame(cleaned_lookup[f"{component}-description"]),
+ how="left",
+ left_on=f"{component_upper}_DESCRIPTION_STARTING",
+ right_on="original_description",
+ ).merge(
+ pd.DataFrame(cleaned_lookup[f"{component}-description"]),
+ how="left",
+ left_on=f"{component_upper}_DESCRIPTION_ENDING",
+ right_on="original_description",
+ suffixes=("", "_ENDING")
+ )
+
+ if component == "walls":
+ # We make sure the wall construction hasn't changed
+ df = df[
+ (df["is_cavity_wall"] == df["is_cavity_wall_ENDING"]) &
+ (df["is_solid_brick"] == df["is_solid_brick_ENDING"]) &
+ (df["is_timber_frame"] == df["is_timber_frame_ENDING"]) &
+ (df["is_granite_or_whinstone"] == df["is_granite_or_whinstone_ENDING"]) &
+ (df["is_cob"] == df["is_cob_ENDING"]) &
+ (df["is_sandstone_or_limestone"] == df["is_sandstone_or_limestone_ENDING"])
+ ]
+ elif component == "floor":
+ df = df[
+ (df["is_suspended"] == df["is_suspended_ENDING"]) &
+ (df["is_solid"] == df["is_solid_ENDING"]) &
+ (df["another_property_below"] == df["another_property_below_ENDING"]) &
+ (df["is_to_unheated_space"] == df["is_to_unheated_space_ENDING"])
+ ]
+ else:
+ df = df[
+ (df["is_pitched"] == df["is_pitched_ENDING"]) &
+ (df["is_roof_room"] == df["is_roof_room_ENDING"]) &
+ (df["is_loft"] == df["is_loft_ENDING"]) &
+ (df["is_flat"] == df["is_flat_ENDING"]) &
+ (df["is_thatched"] == df["is_thatched_ENDING"]) &
+ (df["is_at_rafters"] == df["is_at_rafters_ENDING"]) &
+ (df["has_dwelling_above"] == df["has_dwelling_above_ENDING"])
+ ]
+
+ # Drop the binary indicators and replace the original description with the cleaned version
+
+ # Drop original cols
+ original_cols = [
+ f"{component_upper}_DESCRIPTION_STARTING", f"{component_upper}_DESCRIPTION_ENDING"
+ ]
+
+ df = df.drop(
+ columns=cols_to_drop[component] + original_cols
+ ).rename(
+ columns={
+ "clean_description": f"{component_upper}_DESCRIPTION_STARTING",
+ "clean_description_ENDING": f"{component_upper}_DESCRIPTION_ENDING",
+ }
+ )
+
+ return df
+
+ data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
+
dataset.append(data_by_urpn_df)
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]