Model/etl/epc/settings.py
2025-11-28 06:20:47 +00:00

558 lines
17 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Using a simple Python file as settings for now
# TODO: migrate to dynaconf
from pathlib import Path
# Sentinel values that flag an EPC field as anomalous or missing.
# Case and punctuation variants are listed explicitly; presumably these are compared by exact match at the
# call site (DATA_ANOMALY_SUBSTRINGS covers substring checks) - confirm against the caller.
# NOTE(review): the repository viewer warns that this file contains ambiguous Unicode characters; verify that
# the literals below hold the intended characters.
DATA_ANOMALY_MATCHES = {
    # Invalid reports are where the value provided is out of bounds, e.g. a negative energy rating of -1199 or a
    # non-integer. There is no valid energy band for this, so it is marked as INVALID!
    "INVALID",
    "INVALID!",
    # When the energy certificate was first lodged on the register there was no requirement to lodge this data
    # item, i.e. a non-mandatory item.
    "NO DATA!",
    "NODATA!",
    # When the energy certificate was first lodged on the register there was no requirement to lodge this data
    # item, i.e. a non-mandatory item.
    "N/A",
    # A value generated by the register to account for a data item that was not mandatory when the lodgement of
    # the energy certificate occurred. When the data item became mandatory the register operator, for backwards
    # compatibility purposes, populated the data field with a value of not recorded to ensure that the energy
    # certificate retrieval process is successfully completed. Mandatory data items cannot be applied
    # retrospectively to energy certificates lodged before the date of the change.
    "Not recorded",
    "Not Recorded",
    # The data also contains DECs with an operational rating of 9999 (a default DEC). The production of a
    # default DEC value was allowed to enable building occupiers, with poor quality or no energy data,
    # the opportunity to comply with the regulations. From April 2011 the ability to lodge a default DEC was no
    # longer allowed.
    "9999",
    # The Building Emission Rate (BER) data field for non-domestic buildings may contain a blank value. The BER
    # was only lodged on the register from 7 March 2010.
    # The trailing comma matters: without it Python concatenates this literal with the next string literal in
    # the set ("NULL"), producing a single "BlankNULL" entry and dropping both real sentinels.
    "Blank",
    # There are currently just over 8,600 records where the local authority identifier is null. This is due to
    # the Register Operator not being able to match the building address in the Markermap Ordnance Survey (GB)
    # lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested
    # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds,
    # etc). These records are being published for completeness. An ongoing process to manage these manually added
    # addresses will take time to develop to deal with these and future anomalies.
    #
    # There are several fields within the lodged data where it is possible to enter multiple entries to cater for
    # different data_types of build within a single property, i.e. extensions. This results in multiple entries for
    # the description fields for floor, roof and wall. For the purposes of this data release only the information
    # contained within the first of these multiple entries is being provided. As there are no restrictions on the
    # value in this first field it means that sometimes the first field in a multiple entry description field may
    # contain a null value. A resolution to correct these anomalies will be considered for future data releases.
    "NULL",
    # We sometimes see fields populated with just an empty string.
    "",
    # We sometimes find None values - particularly when we produce an estimated EPC.
    None,
    # An older value which rarely shows up but has been seen in the data.
    "UNKNOWN",
    # Mixed-case variant of the value above.
    "Unknown",
    # Observed error cases
    "(error), (error)",
    "error , error",
    "Description",
    "description",
    "Undefined Welsh description for crtrl code 2113",
    "undefined welsh description for crtrl code 2113",
    "Hot water system",
    "hot water system",
    "Heating system",
    "heating system",
}
# Date string (YYYY-MM-DD) used to indicate whether an EPC is post SAP10.
POST_SAP10_DATE = "2025-06-22"

# Substrings that flag a value as anomalous wherever they appear inside it.
DATA_ANOMALY_SUBSTRINGS = {
    # Some pick-list values have been superseded by another value, e.g. a single "pitched roof" value replaced
    # by three sub-categories of pitched roof. The original value is retained, with this text appended, purely
    # so that the energy certificate retrieval process still completes. Replacement data items cannot be
    # applied retrospectively to certificates lodged on the register before the date of the change.
    "for backward compatibility only",
}
# --- Metric reporting ---
# File name for the metrics output, the metric to optimise, and the column name for the best model.
METRIC_FILENAME = "metrics.csv"
OPTIMISE_METRIC = "mean_absolute_error"
BEST_MODEL_COLUMN_NAME = "best_model"

# --- Residual plot ---
# TODO: remove these settings elsewhere for CML
# Labels for the true and predicted values, and the output image file name.
RESIDUAL_TRUE_LABEL = "true"
RESIDUAL_PREDICTION_LABEL = "pred"
RESIDUAL_FILE = "residual.png"

# Presentation options for the residual plot (font sizes, style, aspect ratio, DPI, range, reference line).
SEABORN_RESIDUAL_AXIS_FONTSIZE = 12
SEABORN_RESIDUAL_TITLE_FONTSIZE = 22
SEABORN_RESIDUAL_STYLE = "whitegrid"
SEABORN_RESIDUAL_ASPECT_RATIO = "equal"
SEABORN_RESIDUAL_PLOT_DPI = 120
SEABORN_RESIDUAL_RANGE = [-100, 100]
SEABORN_RESIDUAL_LINE_COLOUR = "black"
SEABORN_RESIDUAL_LINE_WIDTH = 1
# Per-model hyperparameters, keyed by model name.
# These could move to a dedicated hyperparameters file; ideally such a file would be loaded and passed
# to the training script rather than hard-coded here.
MODEL_HYPERPARAMETERS = {
    "autogluon": {
        "problem_type": "regression",
        "eval_metric": "mean_absolute_error",
        "time_limit": 45,
        "presets": "medium_quality",
        "excluded_model_types": None,
    },
}
# strftime-style pattern used for timestamps.
TIMESTAMP_FORMAT = "%Y_%m_%d_%H_%M_%S"

# Seed and subsampling factor.
RANDOM_SEED = 0
SUBSAMPLE_FACTOR = 200

# Dataset file names.
TRAIN_AND_VALIDATION_DATA_NAME = "train_validation_data.parquet"
TEST_DATA_NAME = "test_data.parquet"

# Model registry layout. The registry directory sits two levels above this file's location.
REGISTRY_FILE = "model_registry.csv"
MODEL_DIRECTORY = "model_directory"
BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY

# Prediction and metadata outputs.
PREDICTION_LOCATION = Path("predictions")
PREDICTION_FILE = "prediction.json"
METADATA_FILE = "metadata.json"

# Folder names.
MODEL_FOLDER = "model"
METRICS_FOLDER = "metrics"
DEPLOYMENT_FOLDER = "deployment"

# National-average values for total floor area and floor height.
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
# Current-cost columns.
COST_FEATURES = [
    "LIGHTING_COST_CURRENT",
    "HEATING_COST_CURRENT",
    "HOT_WATER_COST_CURRENT",
]

# Fixed features listed for averaging.
AVERAGE_FIXED_FEATURES = [
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "FIXED_LIGHTING_OUTLETS_COUNT",
]

# Key columns used when merging.
COLUMNS_TO_MERGE_ON = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
]

# Glazing descriptions that count as fully glazed.
FULLY_GLAZED_DESCRIPTIONS = [
    "Fully double glazed",
    "High performance glazing",
    "Fully triple glazed",
    "Full secondary glazing",
    "Multiple glazing throughout",
]

# Fixed property-level features.
FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "CONSTITUENCY",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
]
# Core building-component columns.
CORE_COMPONENT_FEATURES = [
    "WALLS_DESCRIPTION",
    "FLOOR_DESCRIPTION",
    "LIGHTING_DESCRIPTION",
    "ROOF_DESCRIPTION",
    "MAINHEAT_DESCRIPTION",
    "HOTWATER_DESCRIPTION",
    "MAIN_FUEL",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "WINDOWS_DESCRIPTION",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "MAINHEATCONT_DESCRIPTION",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
]

# Energy-efficiency rating columns.
EFFICIENCY_FEATURES = [
    "HOT_WATER_ENERGY_EFF",
    "FLOOR_ENERGY_EFF",
    "WINDOWS_ENERGY_EFF",
    "WALLS_ENERGY_EFF",
    "SHEATING_ENERGY_EFF",
    "ROOF_ENERGY_EFF",
    "MAINHEAT_ENERGY_EFF",
    "MAINHEATC_ENERGY_EFF",
    "LIGHTING_ENERGY_EFF",
]

# Lower-case room-count feature names.
ROOM_FEATURES = [
    "number_habitable_rooms",
    "number_heated_rooms",
]

# Feature name for the post-SAP10 indicator.
POST_SAP10_FEATURE = [
    "is_post_sap10",
]

# The core component columns followed by a handful of extra columns; built as a new list so the
# core list itself is left untouched.
COMPONENT_FEATURES = [
    *CORE_COMPONENT_FEATURES,
    "TRANSACTION_TYPE",
    "ENERGY_TARIFF",  # Not sure if this is relevant
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    # "GLAZED_AREA" is left out: may not be needed since we have MULTI_GLAZE_PROPORTION
]
# "Potential" columns. The cost variants (LIGHTING_COST_POTENTIAL, HEATING_COST_POTENTIAL,
# HOT_WATER_COST_POTENTIAL) are not included for the moment.
POTENTIAL_COLUMNS = [
    "POTENTIAL_ENERGY_EFFICIENCY",
    "ENVIRONMENT_IMPACT_POTENTIAL",
    "ENERGY_CONSUMPTION_POTENTIAL",
    "CO2_EMISSIONS_POTENTIAL",
]

# Fields for which the latest value wins when there are multiple values. More recent EPCs have been
# conducted with more rigour, so the latest value is assumed to be the most accurate.
LATEST_FIELD = [
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "CONSTRUCTION_AGE_BAND",  # This is a field we'll probably want to use verisk data for
]
# If we see these features changing, we don't use the EPC, since we deem it not to be reliable.
MANDATORY_FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTITUENCY",
]

# Particularly old EPC data has inconsistent records, so only EPCs conducted from this date onwards are
# included. SAP09 was introduced in 2009, and later SAP12 was introduced in England and Wales from
# 31 July 2014.
# NOTE(review): the earlier wording of this comment said "after 2010", but the configured cutoff is
# 2014-08-01 - confirm which is intended.
EARLIEST_EPC_DATE = "2014-08-01"

# Values to ignore.
# NOTE(review): IGNORED_TRANSACTION_TYPES and IGNORED_PROPERTY_TYPES are plain strings while the other
# two are lists - confirm callers compare them accordingly.
IGNORED_TRANSACTION_TYPES = "new dwelling"
IGNORED_FLOOR_LEVELS = [
    "top floor",
    "mid floor",
]
IGNORED_PROPERTY_TYPES = "Park home"
IGNORED_TENURES = [
    (
        "Not defined - use in the case of a new dwelling for which the intended tenure in not known. "
        "It is not to be used for an existing dwelling"
    ),
]

# Response (target) column names.
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"

# Maps the "Enclosed" built-form variants onto their plain equivalents.
BUILT_FORM_REMAP = {
    "Enclosed End-Terrace": "End-Terrace",
    "Enclosed Mid-Terrace": "Mid-Terrace",
}

# Options for the data processor; "column_mappings" lists, per column, the types to apply in order.
DATA_PROCESSOR_SETTINGS = {
    "low_memory": False,
    "epc_minimum_count": 1,
    "column_mappings": {
        "UPRN": [int, str],
        "NUMBER_HEATED_ROOMS": [float],
        "NUMBER_HABITABLE_ROOMS": [float],
    },
}
# Manual mapping of each column to the type it is required to have.
# The type names ("object", "float64") look like pandas/NumPy dtype names - confirm at the call site.
COLUMNTYPES = {
    "UPRN": "object",
    "TOTAL_FLOOR_AREA": "float64",
    "FLOOR_HEIGHT": "float64",
    "PROPERTY_TYPE": "object",
    "BUILT_FORM": "object",
    "CONSTITUENCY": "object",
    "NUMBER_HABITABLE_ROOMS": "float64",
    "NUMBER_HEATED_ROOMS": "float64",
    "FIXED_LIGHTING_OUTLETS_COUNT": "float64",
    "CONSTRUCTION_AGE_BAND": "object",
    "TRANSACTION_TYPE": "object",
    "WALLS_DESCRIPTION": "object",
    "FLOOR_DESCRIPTION": "object",
    "LIGHTING_DESCRIPTION": "object",
    "ROOF_DESCRIPTION": "object",
    "MAINHEAT_DESCRIPTION": "object",
    "HOTWATER_DESCRIPTION": "object",
    "MAIN_FUEL": "object",
    "MECHANICAL_VENTILATION": "object",
    "SECONDHEAT_DESCRIPTION": "object",
    "ENERGY_TARIFF": "object",
    "SOLAR_WATER_HEATING_FLAG": "object",
    "PHOTO_SUPPLY": "float64",
    "WINDOWS_DESCRIPTION": "object",
    "GLAZED_TYPE": "object",
    "MULTI_GLAZE_PROPORTION": "float64",
    "LOW_ENERGY_LIGHTING": "float64",
    "NUMBER_OPEN_FIREPLACES": "float64",
    "MAINHEATCONT_DESCRIPTION": "object",
    "EXTENSION_COUNT": "float64",
    "LODGEMENT_DATE": "object",
    # Every efficiency column is typed "object" ...
    **dict.fromkeys(EFFICIENCY_FEATURES, "object"),
    # ... and every potential column is typed "float64".
    **dict.fromkeys(POTENTIAL_COLUMNS, "float64"),
}
# For modelling, records with more than 100 SAP points are not allowed.
MAX_SAP_SCORE = 100

# Replacement value per column for missing entries.
fill_na_map = {
    # Some descriptions, such as "To be used only when there is no heating/hot-water system or data is from
    # a community network", could be clustered with unknown fuel.
    "MAIN_FUEL": "UNKNOWN",
    "MECHANICAL_VENTILATION": "Unknown",
    "SECONDHEAT_DESCRIPTION": "None",
    "ENERGY_TARIFF": "Unknown",
    # The solar water heating flag is set to N - we could investigate using a different category entirely.
    "SOLAR_WATER_HEATING_FLAG": "N",
    "GLAZED_TYPE": "not defined",
    "MULTI_GLAZE_PROPORTION": 0,
    "LOW_ENERGY_LIGHTING": 0,
    "MAINHEATCONT_DESCRIPTION": "Unknown",
    "EXTENSION_COUNT": 0,
    "NUMBER_OPEN_FIREPLACES": 0,
}
################################################################################################
# Features needed for scoring.
# How this is done will likely change in the future.
################################################################################################

# Columns in the "starting suffix" group.
STARTING_SUFFIX_COMPONENT_COLS = [
    "SAP",
    "HEAT_DEMAND",
    "CARBON",
    "TRANSACTION_TYPE",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "ENERGY_TARIFF",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "DAYS_TO",
    "estimated_perimeter",
]
# Columns in the "no suffix" group. All entries are lower-case feature names.
NO_SUFFIX_COMPONENT_COLS = [
    "walls_thermal_transmittance",
    "is_cavity_wall",
    "is_filled_cavity",
    "is_solid_brick",
    "is_system_built",
    "is_timber_frame",
    "is_granite_or_whinstone",
    "is_as_built",
    "is_cob",
    "is_sandstone_or_limestone",
    "is_park_home",
    "walls_insulation_thickness",
    "external_insulation",
    "internal_insulation",
    "floor_thermal_transmittance",
    "is_to_unheated_space",
    "is_to_external_air",
    "is_suspended",
    "is_solid",
    "another_property_below",
    "floor_insulation_thickness",
    "roof_thermal_transmittance",
    "is_pitched",
    "is_roof_room",
    "is_loft",
    "is_flat",
    "is_thatched",
    "is_at_rafters",
    "has_dwelling_above",
    "roof_insulation_thickness",
    "heater_type",
    "system_type",
    "thermostat_characteristics",
    "heating_scope",
    "energy_recovery",
    "hotwater_tariff_type",
    "extra_features",
    "chp_systems",
    "distribution_system",
    "no_system_present",
    "appliance",
    "has_radiators",
    "has_fan_coil_units",
    "has_pipes_in_screed_above_insulation",
    "has_pipes_in_insulated_timber_floor",
    "has_pipes_in_concrete_slab",
    "has_boiler",
    "has_air_source_heat_pump",
    "has_room_heaters",
    "has_electric_storage_heaters",
    "has_warm_air",
    "has_electric_underfloor_heating",
    "has_electric_ceiling_heating",
    "has_community_scheme",
    "has_ground_source_heat_pump",
    "has_no_system_present",
    "has_portable_electric_heaters",
    "has_water_source_heat_pump",
    "has_electric_heat_pump",
    "has_micro-cogeneration",
    "has_solar_assisted_heat_pump",
    "has_exhaust_source_heat_pump",
    "has_community_heat_pump",
    "has_electric",
    "has_mains_gas",
    "has_wood_logs",
    "has_coal",
    "has_oil",
    "has_wood_pellets",
    "has_anthracite",
    "has_dual_fuel_mineral_and_wood",
    "has_smokeless_fuel",
    "has_lpg",
    "has_b30k",
    "has_electricaire",
    "has_assumed_for_most_rooms",
    "has_underfloor_heating",
    "thermostatic_control",
    "charging_system",
    "switch_system",
    "no_control",
    "dhw_control",
    "community_heating",
    "multiple_room_thermostats",
    "auxiliary_systems",
    "trvs",
    "rate_control",
    "glazing_type",
    "fuel_type",
    "main-fuel_tariff_type",
    "is_community",
    "no_individual_heating_or_community_network",
    "complex_fuel_type",
]
# Columns in the "ending suffix" group: upper-case EPC columns first, then lower-case feature names,
# finishing with "estimated_perimeter".
ENDING_SUFFIX_COMPONENT_COLS = [
    "SAP",
    "HEAT_DEMAND",
    "CARBON",
    "TRANSACTION_TYPE",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "ENERGY_TARIFF",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "DAYS_TO",
    "walls_thermal_transmittance",
    "is_park_home",
    "walls_insulation_thickness",
    "external_insulation",
    "internal_insulation",
    "floor_thermal_transmittance",
    "floor_insulation_thickness",
    "roof_thermal_transmittance",
    "roof_insulation_thickness",
    "heater_type",
    "system_type",
    "thermostat_characteristics",
    "heating_scope",
    "energy_recovery",
    "hotwater_tariff_type",
    "extra_features",
    "chp_systems",
    "distribution_system",
    "no_system_present",
    "appliance",
    "has_radiators",
    "has_fan_coil_units",
    "has_pipes_in_screed_above_insulation",
    "has_pipes_in_insulated_timber_floor",
    "has_pipes_in_concrete_slab",
    "has_boiler",
    "has_air_source_heat_pump",
    "has_room_heaters",
    "has_electric_storage_heaters",
    "has_warm_air",
    "has_electric_underfloor_heating",
    "has_electric_ceiling_heating",
    "has_community_scheme",
    "has_ground_source_heat_pump",
    "has_no_system_present",
    "has_portable_electric_heaters",
    "has_water_source_heat_pump",
    "has_electric_heat_pump",
    "has_micro-cogeneration",
    "has_solar_assisted_heat_pump",
    "has_exhaust_source_heat_pump",
    "has_community_heat_pump",
    "has_electric",
    "has_mains_gas",
    "has_wood_logs",
    "has_coal",
    "has_oil",
    "has_wood_pellets",
    "has_anthracite",
    "has_dual_fuel_mineral_and_wood",
    "has_smokeless_fuel",
    "has_lpg",
    "has_b30k",
    "has_electricaire",
    "has_assumed_for_most_rooms",
    "has_underfloor_heating",
    "thermostatic_control",
    "charging_system",
    "switch_system",
    "no_control",
    "dhw_control",
    "community_heating",
    "multiple_room_thermostats",
    "auxiliary_systems",
    "trvs",
    "rate_control",
    "glazing_type",
    "fuel_type",
    "main-fuel_tariff_type",
    "is_community",
    "no_individual_heating_or_community_network",
    "complex_fuel_type",
    "estimated_perimeter",
]
# Without any filtering, the bottom 0.5% of homes were found to have a floor height of 1.65m.
# Homes with a floor height below this value are therefore filtered out.
MINIMUM_FLOOR_HEIGHT = 1.65