mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Adding pruning and processing of descriptions
This commit is contained in:
parent
c2fcd0041f
commit
e0019662c9
4 changed files with 166 additions and 16 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -134,6 +134,7 @@ EARLIEST_EPC_DATE = "2014-08-01"
|
|||
|
||||
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
|
||||
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
|
||||
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
|
||||
|
||||
|
||||
def ordinal(n):
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
import msgpack
|
||||
|
||||
from pathlib import Path
|
||||
from model_data.simulation_system.core.Settings import (
|
||||
|
|
@ -9,20 +10,42 @@ from model_data.simulation_system.core.Settings import (
|
|||
RDSAP_RESPONSE,
|
||||
HEAT_DEMAND_RESPONSE,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
EARLIEST_EPC_DATE
|
||||
EARLIEST_EPC_DATE,
|
||||
CARBON_RESPONSE,
|
||||
)
|
||||
from model_data.simulation_system.core.DataProcessor import DataProcessor
|
||||
from utils.s3 import save_dataframe_to_s3_parquet
|
||||
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
|
||||
|
||||
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
||||
|
||||
|
||||
def get_cleaned():
    """
    Fetch the cleaned EPC description lookup from S3 and decode it.

    The object ``cleaned_epc_data/cleaned.bson`` is read from the
    ``retrofit-data-dev`` bucket. Its payload is MessagePack-encoded, so it
    is unpacked before being handed back to the caller.

    :return: the structure produced by ``msgpack.unpackb`` for the stored payload
    """
    packed_payload = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev",
    )

    # raw=False asks msgpack to decode raw values to str rather than bytes
    return msgpack.unpackb(packed_payload, raw=False)
|
||||
|
||||
|
||||
def app():
|
||||
# Get all the files in the directory
|
||||
|
||||
# Data glossary:
|
||||
# https://epc.opendatacommunities.org/docs/guidance#glossary
|
||||
|
||||
cleaned_lookup = get_cleaned()
|
||||
|
||||
# List all subdirectories
|
||||
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
|
||||
|
|
@ -84,7 +107,7 @@ def app():
|
|||
# We include the lodgement date here as we probably need to factor time into the
|
||||
# model, since EPC standards and rigour have changed over time
|
||||
variable_data = modified_property_data[
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
|
||||
]
|
||||
|
||||
# Note: we look at changes between subsequent EPCs; however, we could look at other permutations
|
||||
|
|
@ -104,24 +127,29 @@ def app():
|
|||
if gets_better:
|
||||
starting_sap = earliest_record[RDSAP_RESPONSE]
|
||||
starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
|
||||
starting_carbon = earliest_record[CARBON_RESPONSE]
|
||||
|
||||
rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
|
||||
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
else:
|
||||
starting_sap = latest_record[RDSAP_RESPONSE]
|
||||
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
|
||||
rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
|
||||
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
|
||||
|
||||
if rdsap_change == 0:
|
||||
continue
|
||||
|
||||
if gets_better:
|
||||
starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
else:
|
||||
starting_sap = latest_record[RDSAP_RESPONSE]
|
||||
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
|
||||
starting_carbon = latest_record[CARBON_RESPONSE]
|
||||
|
||||
rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
|
||||
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
|
||||
|
||||
starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
|
||||
if rdsap_change == 0:
|
||||
continue
|
||||
|
||||
features = pd.concat([starting_record, ending_record])
|
||||
|
||||
property_model_data.append(
|
||||
|
|
@ -129,8 +157,10 @@ def app():
|
|||
"UPRN": uprn,
|
||||
"RDSAP_CHANGE": rdsap_change,
|
||||
"HEAT_DEMAND_CHANGE": heat_demand_change,
|
||||
"STARTING_SAP": starting_sap,
|
||||
"STARTING_HEAT_DEMAND": starting_heat_demand,
|
||||
"CARBON_CHANGE": carbon_change,
|
||||
"SAP_STARTING": starting_sap,
|
||||
"HEAT_DEMAND_STARTING": starting_heat_demand,
|
||||
"CARBON_STARTING": starting_carbon,
|
||||
**fixed_data,
|
||||
**features.to_dict(),
|
||||
}
|
||||
|
|
@ -152,6 +182,125 @@ def app():
|
|||
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
|
||||
# within descriptions
|
||||
|
||||
# We look for key building fabric features that have changed from one EPC to the next.
|
||||
# If, for example, we see that a home has gone from being a cavity wall to a solid wall, we
|
||||
# remove this record, as it indicates that the quality of the EPC conducted in the first instance
|
||||
# is low
|
||||
# We also replace descriptions with their cleaned variants
|
||||
|
||||
def process_and_prune_desriptions(df, cleaned_lookup):
    """
    Replace raw fabric descriptions with their cleaned variants and prune
    records whose construction type changed between the two EPCs.

    For each of walls, floor and roof, the lookup table is joined onto ``df``
    twice: once on ``<COMPONENT>_DESCRIPTION_STARTING`` and once on
    ``<COMPONENT>_DESCRIPTION_ENDING`` (the second join's columns get an
    ``_ENDING`` suffix). Rows are kept only when every construction flag
    listed for the component is equal at the start and the end. Because the
    joins are left joins, a description with no match in the lookup yields
    NaN flags, which never compare equal, so such rows are removed as well.

    Finally the helper columns brought in by the joins and the two raw
    description columns are dropped, and the cleaned descriptions take over
    the ``<COMPONENT>_DESCRIPTION_STARTING`` / ``_ENDING`` names.

    :param df: DataFrame holding the ``*_DESCRIPTION_STARTING`` and
        ``*_DESCRIPTION_ENDING`` columns for walls, floor and roof
    :param cleaned_lookup: mapping with the keys ``walls-description``,
        ``floor-description`` and ``roof-description``; each value must be
        accepted by ``pd.DataFrame`` and provide ``original_description``,
        ``clean_description`` and the helper columns listed below
    :return: the pruned DataFrame with cleaned description columns
    """

    # TODO: In a future iteration, we can test using the binary features and the insulation thickness
    #  estimates, as well as estimated U-values

    # Lookup columns that are only needed for pruning and are removed afterwards.
    # The "_ENDING" twins are derived from these names rather than listed by hand.
    # "clean_description" is intentionally absent for every component (roof
    # included): it must survive the drop so that it can be renamed into the
    # description columns below.
    helper_columns = {
        "walls": [
            'original_description', 'thermal_transmittance',
            'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity',
            'is_solid_brick', 'is_system_built', 'is_timber_frame',
            'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed',
            'is_sandstone_or_limestone', 'insulation_thickness',
            'external_insulation', 'internal_insulation',
        ],
        "floor": [
            'original_description', 'thermal_transmittance',
            'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space',
            'is_to_external_air', 'is_suspended', 'is_solid',
            'another_property_below', 'insulation_thickness', 'no_data',
        ],
        "roof": [
            'original_description', 'thermal_transmittance',
            'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
            'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
            'has_dwelling_above', 'is_valid', 'insulation_thickness',
        ],
    }

    # Flags that must be identical on the starting and ending EPC; a change in
    # any of them is treated as a sign of a low-quality survey.
    construction_flags = {
        "walls": [
            "is_cavity_wall", "is_solid_brick", "is_timber_frame",
            "is_granite_or_whinstone", "is_cob", "is_sandstone_or_limestone",
        ],
        "floor": [
            "is_suspended", "is_solid", "another_property_below",
            "is_to_unheated_space",
        ],
        "roof": [
            "is_pitched", "is_roof_room", "is_loft", "is_flat", "is_thatched",
            "is_at_rafters", "has_dwelling_above",
        ],
    }

    for component in ["walls", "floor", "roof"]:
        component_upper = component.upper()
        starting_col = f"{component_upper}_DESCRIPTION_STARTING"
        ending_col = f"{component_upper}_DESCRIPTION_ENDING"

        # Build the lookup frame once and reuse it for both joins
        lookup = pd.DataFrame(cleaned_lookup[f"{component}-description"])

        df = df.merge(
            lookup,
            how="left",
            left_on=starting_col,
            right_on="original_description",
        ).merge(
            lookup,
            how="left",
            left_on=ending_col,
            right_on="original_description",
            suffixes=("", "_ENDING"),
        )

        # We make sure the construction hasn't changed between the two EPCs
        unchanged = pd.Series(True, index=df.index)
        for flag in construction_flags[component]:
            unchanged &= df[flag] == df[f"{flag}_ENDING"]
        df = df[unchanged]

        # Drop the binary indicators and the raw descriptions, then let the
        # cleaned descriptions take over the original column names
        columns_to_drop = (
            helper_columns[component]
            + [f"{column}_ENDING" for column in helper_columns[component]]
            + [starting_col, ending_col]
        )

        df = df.drop(columns=columns_to_drop).rename(
            columns={
                "clean_description": starting_col,
                "clean_description_ENDING": ending_col,
            }
        )

    return df
|
||||
|
||||
data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
|
||||
|
||||
dataset.append(data_by_urpn_df)
|
||||
|
||||
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue