mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
221 lines
6.9 KiB
Python
221 lines
6.9 KiB
Python
# Using a simple Python file as settings for now
|
|
# TODO: migrate to dynaconf
|
|
from pathlib import Path
|
|
|
|
# --- Metrics -----------------------------------------------------------------
# File name under which metrics are stored.
METRIC_FILENAME = "metrics.csv"

# Name of the metric to optimise, and of the column flagging the best model.
OPTIMISE_METRIC = "mean_absolute_error"
BEST_MODEL_COLUMN_NAME = "best_model"
|
|
|
|
# --- Residual plot -----------------------------------------------------------
# TODO: move these settings elsewhere for CML

# Labels for the true and predicted values, and the output image file name.
RESIDUAL_TRUE_LABEL = "true"
RESIDUAL_PREDICTION_LABEL = "pred"
RESIDUAL_FILE = "residual.png"

# Plot appearance.
SEABORN_RESIDUAL_STYLE = "whitegrid"
SEABORN_RESIDUAL_ASPECT_RATIO = "equal"
SEABORN_RESIDUAL_AXIS_FONTSIZE = 12
SEABORN_RESIDUAL_TITLE_FONTSIZE = 22
SEABORN_RESIDUAL_PLOT_DPI = 120

# Plotted range, given as [lower, upper].
SEABORN_RESIDUAL_RANGE = [-100, 100]

# Line colour and width.
SEABORN_RESIDUAL_LINE_COLOUR = "black"
SEABORN_RESIDUAL_LINE_WIDTH = 1
|
|
|
|
# --- Model hyperparameters ---------------------------------------------------
# These could move to a dedicated hyperparameters file. If anything, we might
# want a file that can be loaded and passed to this script.
#
# Keyed by model name; each value is that model's settings dictionary.
MODEL_HYPERPARAMETERS = {
    "autogluon": {
        "problem_type": "regression",
        "eval_metric": "mean_absolute_error",
        "time_limit": 45,
        "presets": "medium_quality",
        # None means no model types are listed for exclusion.
        "excluded_model_types": None,
    },
}
|
|
|
|
# --- Run settings ------------------------------------------------------------
# strftime-style pattern: year_month_day_hour_minute_second.
TIMESTAMP_FORMAT = "%Y_%m_%d_%H_%M_%S"

# Seed and subsampling factor.
RANDOM_SEED = 0
SUBSAMPLE_FACTOR = 200
|
|
|
|
# --- Data files --------------------------------------------------------------
TRAIN_AND_VALIDATION_DATA_NAME = "train_validation_data.parquet"
TEST_DATA_NAME = "test_data.parquet"

# --- Model registry layout ---------------------------------------------------
REGISTRY_FILE = "model_registry.csv"
MODEL_DIRECTORY = "model_directory"
# The registry lives in MODEL_DIRECTORY, two directory levels up from this file.
BASE_REGISTRY_PATH = Path(__file__).parent.parent.joinpath(MODEL_DIRECTORY)

# Prediction output location and file names.
PREDICTION_LOCATION = Path("predictions")
PREDICTION_FILE = "prediction.json"
METADATA_FILE = "metadata.json"

# Folder names.
MODEL_FOLDER = "model"
METRICS_FOLDER = "metrics"
DEPLOYMENT_FOLDER = "deployment"
|
|
|
|
# --- National averages -------------------------------------------------------
# NOTE(review): units are not stated here - confirm against the data source.
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45

# Fixed features handled as averages.
AVERAGE_FIXED_FEATURES = [
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "FIXED_LIGHTING_OUTLETS_COUNT",
]
|
|
|
|
# Columns used as the keys when merging.
COLUMNS_TO_MERGE_ON = [
    # Property characteristics
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    # Room counts
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
]
|
|
|
|
# Glazing descriptions that count as fully glazed.
# The strings are matched as written, so spelling and case matter.
FULLY_GLAZED_DESCRIPTIONS = [
    "Fully double glazed",
    "High performance glazing",
    "Fully triple glazed",
    "Full secondary glazing",
    "Multiple glazing throughout",
]
|
|
|
|
# Features treated as fixed.
FIXED_FEATURES = [
    # Property characteristics
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    # Location
    "CONSTITUENCY",
    # Remaining counts
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
]
|
|
|
|
# Component features of a property record.
COMPONENT_FEATURES = [
    "TRANSACTION_TYPE",
    # Fabric, lighting and heating descriptions
    "WALLS_DESCRIPTION",
    "FLOOR_DESCRIPTION",
    "LIGHTING_DESCRIPTION",
    "ROOF_DESCRIPTION",
    "MAINHEAT_DESCRIPTION",
    "HOTWATER_DESCRIPTION",
    "MAIN_FUEL",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "ENERGY_TARIFF",  # Not sure if this is relevant
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
    # Windows and glazing
    "WINDOWS_DESCRIPTION",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    # Remaining fields
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "MAINHEATCONT_DESCRIPTION",
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    # GLAZED_AREA is left out: it may not be needed since we have
    # MULTI_GLAZE_PROPORTION.
]
|
|
|
|
# Fields for which the most recent value wins when several values exist.
# More recent EPCs have been conducted with more rigour, so we assume the
# latest value is the most accurate.
LATEST_FIELD = [
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    # We will probably want to use Verisk data for this field.
    "CONSTRUCTION_AGE_BAND",
]
|
|
|
|
# If we see these features changing, we don't use the EPC, since we deem it
# not to be reliable.
MANDATORY_FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTITUENCY",
]

# Particularly old EPC data has inconsistent records, so older EPCs are
# excluded. SAP09 was introduced in 2009 and SAP12 was later introduced in
# England and Wales from 31 July 2014.
# NOTE(review): the earlier wording of this note said "after 2010", which does
# not match the value below - confirm which cut-off is intended.
EARLIEST_EPC_DATE = "2014-08-01"

# Column names of the response variables.
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
|
|
|
|
|
|
def ordinal(n):
    """Return *n* as a string with its English ordinal suffix appended.

    Examples: ``1 -> "1st"``, ``2 -> "2nd"``, ``3 -> "3rd"``, ``11 -> "11th"``,
    ``22 -> "22nd"``, ``-1 -> "-1st"``.

    Args:
        n: The integer to format. Negative values keep their sign.

    Returns:
        ``str(n)`` followed by "st", "nd", "rd" or "th".
    """
    # The suffix depends only on the magnitude. Python's % returns a
    # non-negative remainder for negative operands (-9 % 10 == 1), so using n
    # directly would yield results such as "-9st" and "-1th".
    magnitude = abs(n)

    # 11, 12 and 13 (and 111, 212, ...) take "th" despite ending in 1, 2 or 3.
    if 11 <= magnitude % 100 <= 13:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(magnitude % 10, "th")

    return str(n) + suffix
|
|
|
|
|
|
# Lookup from a floor-level label to an integer level.
# The groups are merged in the order listed. Where a key occurs in more than
# one group (e.g. "10" in both the zero-padded and plain-string groups) it
# maps to the same value in each.
FLOOR_LEVEL_MAP = {
    # Named levels and the open-ended upper bands.
    "Basement": -1,
    "Ground": 0,
    "ground floor": 0,
    "20+": 20,
    "21st or above": 21,
    # Zero-padded two-digit strings: "00" .. "20".
    **{f"{level:02d}": level for level in range(21)},
    # Ordinal strings produced by ordinal() for -1 .. 20.
    **{ordinal(level): level for level in range(-1, 21)},
    # Plain decimal strings: "-1" .. "20".
    **{str(level): level for level in range(-1, 21)},
    # Integers map to themselves: -1 .. 20.
    **{level: level for level in range(-1, 21)},
}
|
|
|
|
# Replacement built-form labels: each "Enclosed" variant maps to the plain
# form of the same name.
BUILT_FORM_REMAP = {
    "Enclosed End-Terrace": "End-Terrace",
    "Enclosed Mid-Terrace": "Mid-Terrace",
}
|
|
|
|
# Options for the data processor.
DATA_PROCESSOR_SETTINGS = {
    "low_memory": False,
    "epc_minimum_count": 1,
    # Per-column list of types.
    # NOTE(review): presumably casts applied in order (int, then str) - confirm
    # against the data processor.
    "column_mappings": {
        "UPRN": [int, str],
    },
}
|
|
|
|
# Manual mapping from column name to the type required for that column.
# NOTE(review): the values look like pandas/NumPy dtype names - confirm usage.
COLUMNTYPES = {
    # Identifier and fixed property attributes
    "UPRN": "object",
    "TOTAL_FLOOR_AREA": "float64",
    "FLOOR_HEIGHT": "float64",
    "PROPERTY_TYPE": "object",
    "BUILT_FORM": "object",
    "CONSTITUENCY": "object",
    "NUMBER_HABITABLE_ROOMS": "float64",
    "NUMBER_HEATED_ROOMS": "float64",
    "FIXED_LIGHTING_OUTLETS_COUNT": "float64",
    "CONSTRUCTION_AGE_BAND": "object",
    # Transaction and component descriptions
    "TRANSACTION_TYPE": "object",
    "WALLS_DESCRIPTION": "object",
    "FLOOR_DESCRIPTION": "object",
    "LIGHTING_DESCRIPTION": "object",
    "ROOF_DESCRIPTION": "object",
    "MAINHEAT_DESCRIPTION": "object",
    "HOTWATER_DESCRIPTION": "object",
    "MAIN_FUEL": "object",
    "MECHANICAL_VENTILATION": "object",
    "SECONDHEAT_DESCRIPTION": "object",
    "ENERGY_TARIFF": "object",
    "SOLAR_WATER_HEATING_FLAG": "object",
    "PHOTO_SUPPLY": "float64",
    # Windows and glazing
    "WINDOWS_DESCRIPTION": "object",
    "GLAZED_TYPE": "object",
    "MULTI_GLAZE_PROPORTION": "float64",
    # Remaining fields
    "LOW_ENERGY_LIGHTING": "float64",
    "NUMBER_OPEN_FIREPLACES": "float64",
    "MAINHEATCONT_DESCRIPTION": "object",
    "EXTENSION_COUNT": "float64",
    "LODGEMENT_DATE": "object",
}
|
|
|
|
# For modelling, records with more than 100 SAP points are not allowed.
MAX_SAP_SCORE = 100

# Per-column replacement values for missing data.
fill_na_map = {
    # Some descriptions, such as "To be used only when there is no
    # heating/hot-water system or data is from a community network", could be
    # clustered with unknown fuel.
    "MAIN_FUEL": "UNKNOWN",
    "MECHANICAL_VENTILATION": "Unknown",
    "SECONDHEAT_DESCRIPTION": "None",
    "ENERGY_TARIFF": "Unknown",
    # The solar water heating flag is set to N; using a different category
    # entirely could be investigated.
    "SOLAR_WATER_HEATING_FLAG": "N",
    "GLAZED_TYPE": "not defined",
    # Numeric columns are filled with zero.
    "MULTI_GLAZE_PROPORTION": 0,
    "LOW_ENERGY_LIGHTING": 0,
    "MAINHEATCONT_DESCRIPTION": "Unknown",
    "EXTENSION_COUNT": 0,
    "NUMBER_OPEN_FIREPLACES": 0,
}
|
|
|
|
# Once the property descriptions have been remapped, these derived features
# are expected to be fixed.
# NOTE(review): the constant name is misspelt ("DESCRIPTON"); it is kept as-is
# because other modules may import it under this name.
FIXED_DESCRIPTON_MAPPED_FEATURES = [
    "another_property_below",
    "is_roof_room",
    "is_granite_or_whinstone",
    "is_flat",
    "is_suspended",
    "has_dwelling_above",
    "is_as_built",
    "is_to_external_air",
    "is_cob",
    "is_pitched",
    "is_solid",
    "is_at_rafters",
    "is_solid_brick",
    "is_loft",
    "is_system_built",
    "is_timber_frame",
    "is_sandstone_or_limestone",
    "is_filled_cavity",
    "is_cavity_wall",
    "is_thatched",
    "is_to_unheated_space",
]
|