Adding pruning and processing of descriptions

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-14 16:23:05 +01:00
parent c2fcd0041f
commit e0019662c9
4 changed files with 166 additions and 16 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -134,6 +134,7 @@ EARLIEST_EPC_DATE = "2014-08-01"
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
def ordinal(n):

View file

@ -1,5 +1,6 @@
import pandas as pd
from tqdm import tqdm
import msgpack
from pathlib import Path
from model_data.simulation_system.core.Settings import (
@ -9,20 +10,42 @@ from model_data.simulation_system.core.Settings import (
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
EARLIEST_EPC_DATE
EARLIEST_EPC_DATE,
CARBON_RESPONSE,
)
from model_data.simulation_system.core.DataProcessor import DataProcessor
from utils.s3 import save_dataframe_to_s3_parquet
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
# Root folder of the EPC certificate data, resolved relative to this file;
# app() lists the sub-directories of this folder.
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
def get_cleaned():
    """
    Fetch the cleaned description lookup for the epc dataset from s3.

    The stored object is MessagePack encoded (despite the ``.bson`` file
    name), so the raw payload is unpacked before it is handed back.
    :return: the decoded MessagePack payload
    """
    packed_payload = read_from_s3(
        bucket_name="retrofit-data-dev",
        s3_file_name="cleaned_epc_data/cleaned.bson",
    )
    # raw=False makes msgpack decode binary strings to str rather than bytes
    return msgpack.unpackb(packed_payload, raw=False)
def app():
# Get all the files in the directory
# Data glossary:
# https://epc.opendatacommunities.org/docs/guidance#glossary
cleaned_lookup = get_cleaned()
# List all subdirectories
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
@ -84,7 +107,7 @@ def app():
# We include the lodgement date here as we probably need to factor time into the
# model, since EPC standards and rigour have changed over time
variable_data = modified_property_data[
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
]
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
@ -104,24 +127,29 @@ def app():
if gets_better:
starting_sap = earliest_record[RDSAP_RESPONSE]
starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
starting_carbon = earliest_record[CARBON_RESPONSE]
rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
else:
starting_sap = latest_record[RDSAP_RESPONSE]
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
if rdsap_change == 0:
continue
if gets_better:
starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
else:
starting_sap = latest_record[RDSAP_RESPONSE]
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
starting_carbon = latest_record[CARBON_RESPONSE]
rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
if rdsap_change == 0:
continue
features = pd.concat([starting_record, ending_record])
property_model_data.append(
@ -129,8 +157,10 @@ def app():
"UPRN": uprn,
"RDSAP_CHANGE": rdsap_change,
"HEAT_DEMAND_CHANGE": heat_demand_change,
"STARTING_SAP": starting_sap,
"STARTING_HEAT_DEMAND": starting_heat_demand,
"CARBON_CHANGE": carbon_change,
"SAP_STARTING": starting_sap,
"HEAT_DEMAND_STARTING": starting_heat_demand,
"CARBON_STARTING": starting_carbon,
**fixed_data,
**features.to_dict(),
}
@ -152,6 +182,125 @@ def app():
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
# within descriptions
# We look for key building fabric features that have changed from one EPC to the next.
# If, for example, we see that a home has gone from being a cavity wall to a solid wall, we
# remove this record, as it indicates that the quality of the EPC conducted in the first instance
# is low.
# We also replace descriptions with their cleaned variants.
def process_and_prune_desriptions(df, cleaned_lookup):
    """
    Swap the raw walls/floor/roof descriptions for their cleaned variants and
    prune records whose construction type differs between the two EPCs.

    For each component the cleaned lookup is left-joined twice, once on the
    ``*_DESCRIPTION_STARTING`` column and once on ``*_DESCRIPTION_ENDING``.
    Rows are kept only when every construction flag of the component is equal
    on both sides. The helper columns brought in by the joins are then dropped
    and the cleaned descriptions take the place of the original ones.

    :param df: dataframe holding ``WALLS``/``FLOOR``/``ROOF``
        ``_DESCRIPTION_STARTING`` and ``_DESCRIPTION_ENDING`` columns
    :param cleaned_lookup: mapping with the keys ``walls-description``,
        ``floor-description`` and ``roof-description``; each value must be
        accepted by ``pd.DataFrame`` and provide ``original_description``,
        ``clean_description`` and the helper columns listed below
    :return: the pruned dataframe with cleaned description columns
    """
    # TODO: In a future iteration, we can test using the binary features and the insulation thickness
    #  estimates, as well as estimated U-values

    # Lookup columns that are only needed for pruning. "clean_description" is
    # deliberately absent for every component (roof included): it is renamed
    # into the final description column further down, so it must survive the drop.
    helper_columns = {
        "walls": [
            "original_description", "thermal_transmittance",
            "thermal_transmittance_unit", "is_cavity_wall", "is_filled_cavity",
            "is_solid_brick", "is_system_built", "is_timber_frame",
            "is_granite_or_whinstone", "is_as_built", "is_cob", "is_assumed",
            "is_sandstone_or_limestone", "insulation_thickness",
            "external_insulation", "internal_insulation",
        ],
        "floor": [
            "original_description", "thermal_transmittance",
            "thermal_transmittance_unit", "is_assumed", "is_to_unheated_space",
            "is_to_external_air", "is_suspended", "is_solid",
            "another_property_below", "insulation_thickness", "no_data",
        ],
        "roof": [
            "original_description", "thermal_transmittance",
            "thermal_transmittance_unit", "is_pitched", "is_roof_room", "is_loft",
            "is_flat", "is_thatched", "is_at_rafters", "is_assumed",
            "has_dwelling_above", "is_valid", "insulation_thickness",
        ],
    }

    # Flags that describe the construction itself; these must be identical
    # between the starting and the ending EPC for the record to be kept
    invariant_flags = {
        "walls": [
            "is_cavity_wall", "is_solid_brick", "is_timber_frame",
            "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone",
        ],
        "floor": [
            "is_suspended", "is_solid", "another_property_below",
            "is_to_unheated_space",
        ],
        "roof": [
            "is_pitched", "is_roof_room", "is_loft", "is_flat", "is_thatched",
            "is_at_rafters", "has_dwelling_above",
        ],
    }

    for component in ["walls", "floor", "roof"]:
        component_upper = component.upper()
        starting_col = f"{component_upper}_DESCRIPTION_STARTING"
        ending_col = f"{component_upper}_DESCRIPTION_ENDING"

        # Build the lookup frame once and reuse it for both joins
        lookup = pd.DataFrame(cleaned_lookup[f"{component}-description"])

        df = df.merge(
            lookup,
            how="left",
            left_on=starting_col,
            right_on="original_description",
        ).merge(
            lookup,
            how="left",
            left_on=ending_col,
            right_on="original_description",
            suffixes=("", "_ENDING"),
        )

        # Keep only rows where the construction hasn't changed. A description
        # without a lookup match yields NaN flags, and NaN never compares
        # equal, so such rows are pruned here as well.
        unchanged = pd.Series(True, index=df.index)
        for flag in invariant_flags[component]:
            unchanged &= df[flag] == df[f"{flag}_ENDING"]
        df = df[unchanged]

        # Drop the helper columns (both sides) and the raw descriptions, then
        # let the cleaned descriptions take over the original column names
        base_cols = helper_columns[component]
        ending_cols = [f"{col}_ENDING" for col in base_cols]
        df = df.drop(
            columns=base_cols + ending_cols + [starting_col, ending_col]
        ).rename(
            columns={
                "clean_description": starting_col,
                "clean_description_ENDING": ending_col,
            }
        )

    return df
data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
dataset.append(data_by_urpn_df)
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]