diff --git a/backend/Property.py b/backend/Property.py index af568b86..4bd77ec8 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -2,10 +2,10 @@ from datetime import datetime import re from epc_api.client import EpcClient from model_data.config import EPC_AUTH_TOKEN -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions -class Property(BaseUtility): +class Property(Definitions): ATTRIBUTE_MAP = { "floor-description": "floor", "hotwater-description": "hotwater", diff --git a/backend/app/db/functions/recommendations_functions.py b/backend/app/db/functions/recommendations_functions.py index 84f2f967..1d86a925 100644 --- a/backend/app/db/functions/recommendations_functions.py +++ b/backend/app/db/functions/recommendations_functions.py @@ -96,6 +96,9 @@ def upload_recommendations(session, recommendations_to_upload, property_id): "recommendation_id": recommendation_id, "material_id": part["id"], "depth": part["depths"][0] if part["depths"] else None, + "quantity": part["quantity"], + "quantity_unit": part["quantity_unit"], + "estimated_cost": part["estimated_cost"], } for rec, recommendation_id in zip(recommendations_to_upload, uploaded_recommendation_ids) for part in rec["parts"] diff --git a/backend/app/db/models/recommendations.py b/backend/app/db/models/recommendations.py index 60325562..5515b90d 100644 --- a/backend/app/db/models/recommendations.py +++ b/backend/app/db/models/recommendations.py @@ -1,8 +1,9 @@ -from sqlalchemy import Column, BigInteger, String, Float, Boolean, TIMESTAMP, ForeignKey -from sqlalchemy.orm import declarative_base, relationship +from sqlalchemy import Column, BigInteger, String, Float, Boolean, TIMESTAMP, ForeignKey, Enum +from sqlalchemy.orm import declarative_base from sqlalchemy.sql import func from backend.app.db.models.portfolio import Portfolio, PropertyModel from backend.app.db.models.materials import Material +from datatypes.enums import QuantityUnits Base = declarative_base() 
@@ -37,6 +38,9 @@ class RecommendationMaterials(Base): material_id = Column(BigInteger, ForeignKey(Material.id), nullable=False) created_at = Column(TIMESTAMP, nullable=False, server_default=func.now()) depth = Column(Float, nullable=False) + quantity = Column(Float, nullable=False) + quantity_unit = Column(Enum(QuantityUnits, values_callable=lambda x: [e.value for e in x]), nullable=False) + estimated_cost = Column(Float, nullable=False) class Plan(Base): diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 70b1ad19..0f68794b 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -115,7 +115,7 @@ def insert_temp_recommendation_id(property_recommendations): Creates a temporary recommendation id which is needed for filtering recommendations between default and no, after the optimiser has been run - :param property_recommendations: nested list of recommendations, grouped by types + :param property_recommendations: nested list of recommendations, grouped by data_types :return: Updated recommendations_to_upload, where where recommendation has a "recommendation_id" integer inserted """ diff --git a/datatypes/enums.py b/datatypes/enums.py new file mode 100644 index 00000000..1b0959e0 --- /dev/null +++ b/datatypes/enums.py @@ -0,0 +1,5 @@ +import enum + + +class QuantityUnits(enum.Enum): + m2 = "m2" diff --git a/model_data/BaseUtility.py b/model_data/BaseUtility.py index 6337f26a..beece742 100644 --- a/model_data/BaseUtility.py +++ b/model_data/BaseUtility.py @@ -1,4 +1,4 @@ -class BaseUtility: +class Definitions: """ This class contains some base attributes which are used across multiple other classes """ @@ -38,7 +38,7 @@ class BaseUtility: # addresses will take time to develop to deal with these and future anomalies. # # There are several fields within the lodged data where it is possible to enter multiple entries to cater for - # different types of build within a single property, i.e. extensions. 
This results in multiple entries for + # different types of build within a single property, i.e. extensions. This results in multiple entries for # the description fields for floor, roof and wall. For the purposes of this data release only the information # contained within the first of these multiple entries is being provided. As there are no restrictions on the # value in this first field it means that sometimes the first field in a multiple entry description field may diff --git a/model_data/app.py b/model_data/app.py index 6ccc956c..e6761121 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -22,7 +22,7 @@ LAND_REGISTRY_PATHS = [ def app(): """ - For a pre-defined list of constituencies and property types, we'll download EPC data from the API + For a pre-defined list of constituencies and property types, we'll download EPC data from the API and produce a dataset of cleaned fields so that when we get new properties, we can quickly sanitise any description data :return: diff --git a/model_data/epc_attributes/FloorAttributes.py b/model_data/epc_attributes/FloorAttributes.py index 0d8ea493..024ec6dc 100644 --- a/model_data/epc_attributes/FloorAttributes.py +++ b/model_data/epc_attributes/FloorAttributes.py @@ -1,9 +1,9 @@ from typing import Dict, Union -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from model_data.epc_attributes.attribute_utils import extract_thermal_transmittance, extract_component_types -class FloorAttributes(BaseUtility): +class FloorAttributes(Definitions): DWELLING_BELOW = ["another dwelling below", "other premises below"] FLOOR_TYPES = ["assumed", "to unheated space", "to external air", "suspended", "solid"] diff --git a/model_data/epc_attributes/HotWaterAttributes.py b/model_data/epc_attributes/HotWaterAttributes.py index 79dfe62d..97664416 100644 --- a/model_data/epc_attributes/HotWaterAttributes.py +++ b/model_data/epc_attributes/HotWaterAttributes.py @@ -1,9 +1,9 @@ from 
typing import Dict, Union -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from model_data.epc_attributes.attribute_utils import clean_description, find_keyword -class HotWaterAttributes(BaseUtility): +class HotWaterAttributes(Definitions): # HEATER_TYPES refer to the main devices used for heating water. These devices can be powered by different energy # sources. HEATER_TYPES = [ diff --git a/model_data/epc_attributes/MainFuelAttributes.py b/model_data/epc_attributes/MainFuelAttributes.py index 99ad8ed8..055f4cac 100644 --- a/model_data/epc_attributes/MainFuelAttributes.py +++ b/model_data/epc_attributes/MainFuelAttributes.py @@ -1,9 +1,9 @@ from typing import Dict, Union -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from model_data.epc_attributes.attribute_utils import clean_description, remove_punctuation, find_keyword -class MainFuelAttributes(BaseUtility): +class MainFuelAttributes(Definitions): FUEL_KEYWORDS = [ 'heat network', 'mains gas', @@ -96,7 +96,7 @@ class MainFuelAttributes(BaseUtility): if not result["fuel_type"]: result["fuel_type"] = self.UNKNOWN_FUEL - # We'll do checks on unknown fuel types to ensure we don't miss anything + # We'll do checks on unknown fuel data_types to ensure we don't miss anything self.is_unknown = True return result diff --git a/model_data/epc_attributes/MainheatAttributes.py b/model_data/epc_attributes/MainheatAttributes.py index 727ce0e6..492c3123 100644 --- a/model_data/epc_attributes/MainheatAttributes.py +++ b/model_data/epc_attributes/MainheatAttributes.py @@ -1,9 +1,9 @@ -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from model_data.epc_attributes.attribute_utils import clean_description, process_part from typing import Dict, Union -class MainHeatAttributes(BaseUtility): +class MainHeatAttributes(Definitions): HEAT_SYSTEMS = [ "boiler", "air source heat pump", "room 
heaters", "electric storage heaters", "warm air", "electric underfloor heating", "electric ceiling heating", "community scheme", diff --git a/model_data/epc_attributes/MainheatControlAttributes.py b/model_data/epc_attributes/MainheatControlAttributes.py index 2cd4f68d..e1c3ed4f 100644 --- a/model_data/epc_attributes/MainheatControlAttributes.py +++ b/model_data/epc_attributes/MainheatControlAttributes.py @@ -1,9 +1,9 @@ from typing import Dict, Union -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from model_data.epc_attributes.attribute_utils import clean_description, find_keyword -class MainheatControlAttributes(BaseUtility): +class MainheatControlAttributes(Definitions): # These systems allow for the automatic regulation of temperature THERMOSTATIC_CONTROL_KEYWORDS = [ 'room thermostats', diff --git a/model_data/epc_attributes/RoofAttributes.py b/model_data/epc_attributes/RoofAttributes.py index 542978a4..df1ce977 100644 --- a/model_data/epc_attributes/RoofAttributes.py +++ b/model_data/epc_attributes/RoofAttributes.py @@ -1,10 +1,10 @@ import re from typing import Dict, Union -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from model_data.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance -class RoofAttributes(BaseUtility): +class RoofAttributes(Definitions): ROOF_TYPES = ['pitched', 'roof room', 'loft', 'flat', 'thatched', 'at rafters', 'assumed'] DWELLING_ABOVE = ["another dwelling above", "other premises above"] diff --git a/model_data/epc_attributes/WallAttributes.py b/model_data/epc_attributes/WallAttributes.py index 886d2956..a0601029 100644 --- a/model_data/epc_attributes/WallAttributes.py +++ b/model_data/epc_attributes/WallAttributes.py @@ -1,9 +1,9 @@ from typing import Dict, Union -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from 
model_data.epc_attributes.attribute_utils import extract_component_types, extract_thermal_transmittance -class WallAttributes(BaseUtility): +class WallAttributes(Definitions): WALL_TYPES = ['cavity wall', 'filled cavity', 'solid brick', 'system built', 'timber frame', 'granite or whinstone', 'as built', 'cob', 'assumed', 'sandstone or limestone'] diff --git a/model_data/epc_attributes/WindowAttributes.py b/model_data/epc_attributes/WindowAttributes.py index 836d3dd0..a0985870 100644 --- a/model_data/epc_attributes/WindowAttributes.py +++ b/model_data/epc_attributes/WindowAttributes.py @@ -1,9 +1,9 @@ from typing import Dict, Union -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from model_data.epc_attributes.attribute_utils import clean_description -class WindowAttributes(BaseUtility): +class WindowAttributes(Definitions): GLAZING_KEYWORDS = ["glazing", "glazed", "glaze"] GLAZING_COVERAGE = ["fully", "mostly", "partial", "some", "full", "thoughout"] GLAZING_TYPES = ["double", "triple", "secondary", "multiple", "high performance", "single"] diff --git a/model_data/epc_attributes/attribute_utils.py b/model_data/epc_attributes/attribute_utils.py index b7140ab1..9819cc01 100644 --- a/model_data/epc_attributes/attribute_utils.py +++ b/model_data/epc_attributes/attribute_utils.py @@ -36,13 +36,13 @@ def extract_component_types(result: dict, description: str, list_of_components: Dict[str, Union[None, str, float]], str ]: """ - Extracts component types from the description, updates the result dictionary, and removes the matched component - types from the description. + Extracts component data_types from the description, updates the result dictionary, and removes the matched component + data_types from the description. :param result: Dictionary to store the results in. :param description: Lowercase description string. - :param list_of_components: List of component types to extract from the description. 
- :return: A tuple containing the updated result dictionary and the description with the matched component types + :param list_of_components: List of component data_types to extract from the description. + :return: A tuple containing the updated result dictionary and the description with the matched component data_types removed. """ for component in list_of_components: diff --git a/model_data/simulation_system/DataProcessor.py b/model_data/simulation_system/DataProcessor.py index 477883c4..50abd8e3 100644 --- a/model_data/simulation_system/DataProcessor.py +++ b/model_data/simulation_system/DataProcessor.py @@ -1,7 +1,7 @@ from pathlib import Path import numpy as np import pandas as pd -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from simulation_system.Settings import ( DATA_PROCESSOR_SETTINGS, EARLIEST_EPC_DATE, @@ -12,7 +12,7 @@ from simulation_system.Settings import ( FLOOR_LEVEL_MAP, BUILT_FORM_REMAP, COLUMNS_TO_MERGE_ON - ) +) from typing import List @@ -43,11 +43,11 @@ class DataProcessor: if DATA_PROCESSOR_SETTINGS['epc_minimum_count'] >= 1: # If we have multiple EPC records, we can try and do filling self.fill_na_fields() - + self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) return self.data - + def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON): """ If we have a minimum of 2 epcs, we can do back fill and forward fill on certain data fields @@ -56,35 +56,33 @@ class DataProcessor: # The groupby changes the order and we use the index to make the original data filled_data = self.data.groupby("UPRN", group_keys=True)[columns_to_fill].apply( lambda group: group.fillna(method='bfill').fillna(method='ffill') - ).reset_index().set_index('level_1').sort_index() + ).reset_index().set_index('level_1').sort_index() + + self.data[columns_to_fill] = filled_data[columns_to_fill] - self.data[columns_to_fill] = filled_data[columns_to_fill] - - def remap_columns(self): """ 
Remap all columns, for any non values """ # Map all anomaly values to None - data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES))) - + data_anomaly_map = dict(zip(Definitions.DATA_ANOMALY_MATCHES, [None] * len(Definitions.DATA_ANOMALY_MATCHES))) + # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values data = self.data.replace(data_anomaly_map) data = data.replace(np.NAN, None) - + # Remap certain columns data['FLOOR_LEVEL'] = data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP) data['BUILT_FROM'] = data['BUILT_FORM'].replace(BUILT_FORM_REMAP) self.data = data - def make_cleaning_averages(self) -> pd.DataFrame: # Define a custom function to calculate the median, excluding missing values def median_without_missing(group): return group[AVERAGE_FIXED_FEATURES].median(skipna=True) - + cleaning_averages = self.data.groupby( ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"], observed=True, @@ -93,41 +91,58 @@ class DataProcessor: general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply( median_without_missing).reset_index() - + property_averages = self.data.groupby(["PROPERTY_TYPE"], observed=True).apply( median_without_missing).reset_index() - + built_form_averages = self.data.groupby(["BUILT_FORM"], observed=True).apply( median_without_missing).reset_index() - + # We can clean up any NA's in the cleaning averages with the general averages here - cleaning_averages_filled = pd.merge(cleaning_averages, general_averages, on=['PROPERTY_TYPE', 'BUILT_FORM'], suffixes=['', '_AVERAGE']) - cleaning_averages_filled = pd.merge(cleaning_averages_filled, property_averages, on=['PROPERTY_TYPE'], suffixes=['', '_PROPERTY_AVERAGE']) - cleaning_averages_filled = pd.merge(cleaning_averages_filled, built_form_averages, on=['BUILT_FORM'], suffixes=['', '_BUILT_FORM_AVERAGE']) + cleaning_averages_filled = 
pd.merge(cleaning_averages, general_averages, on=['PROPERTY_TYPE', 'BUILT_FORM'], + suffixes=['', '_AVERAGE']) + cleaning_averages_filled = pd.merge(cleaning_averages_filled, property_averages, on=['PROPERTY_TYPE'], + suffixes=['', '_PROPERTY_AVERAGE']) + cleaning_averages_filled = pd.merge(cleaning_averages_filled, built_form_averages, on=['BUILT_FORM'], + suffixes=['', '_BUILT_FORM_AVERAGE']) # Replace any missing NAN values with averages for the same Property type and built form - cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_AVERAGE']) - cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_AVERAGE']) - cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE']) + cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna( + cleaning_averages_filled['TOTAL_FLOOR_AREA_AVERAGE']) + cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna( + cleaning_averages_filled['FLOOR_HEIGHT_AVERAGE']) + cleaning_averages_filled = cleaning_averages_filled.drop( + columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE']) - # If there are still NA values i.e. the averages do not have values for a speicifc group of property tyope and built form + # If there are still NA values i.e. 
the averages do not have values for a specific group of property type + # and built form # We can use just the property type average and replace - cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE']) - cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_PROPERTY_AVERAGE']) - cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE', 'FLOOR_HEIGHT_PROPERTY_AVERAGE']) + cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna( + cleaning_averages_filled['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE']) + cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna( + cleaning_averages_filled['FLOOR_HEIGHT_PROPERTY_AVERAGE']) + cleaning_averages_filled = cleaning_averages_filled.drop( + columns=['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE', 'FLOOR_HEIGHT_PROPERTY_AVERAGE']) # If there are still NA values, use BUILT FORM averages - cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE']) - cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_BUILT_FORM_AVERAGE']) - cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE', 'FLOOR_HEIGHT_BUILT_FORM_AVERAGE']) + cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna( + cleaning_averages_filled['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE']) + cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna( + cleaning_averages_filled['FLOOR_HEIGHT_BUILT_FORM_AVERAGE']) + cleaning_averages_filled = cleaning_averages_filled.drop( + 
columns=['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE', 'FLOOR_HEIGHT_BUILT_FORM_AVERAGE']) # If there still is na values, use average across all properties in consituecy - cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA'].mean()) - cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT'].mean()) + cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna( + cleaning_averages_filled['TOTAL_FLOOR_AREA'].mean()) + cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna( + cleaning_averages_filled['FLOOR_HEIGHT'].mean()) # If the consituency is all NA values, then take UK AVERAGE VALUES - cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(TOTAL_FLOOR_AREA_NATIONAL_AVERAGE) - cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(FLOOR_HEIGHT_NATIONAL_AVERAGE) + cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna( + TOTAL_FLOOR_AREA_NATIONAL_AVERAGE) + cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna( + FLOOR_HEIGHT_NATIONAL_AVERAGE) return cleaning_averages_filled @@ -143,7 +158,6 @@ class DataProcessor: counts = counts[counts["count"] > epc_minimum_count] self.data = pd.merge(self.data, counts, on='UPRN') - def recast_df_columns(self, column_mappings: dict) -> None: """ Recast columns from the dataframe to ensure the behaviour we want @@ -156,7 +170,6 @@ class DataProcessor: for value in values: self.data[key] = self.data[key].astype(value) - def confine_data(self) -> None: """ Include all step to reduce down the data based on assumptions @@ -177,12 +190,11 @@ class DataProcessor: self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"] self.data = 
self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])] - def clean_multi_glaze_proportion(self) -> None: """ If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100 """ - no_multi_glaze_proportion_index = pd.isnull(self.data["MULTI_GLAZE_PROPORTION"]) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)) + no_multi_glaze_proportion_index = pd.isnull(self.data["MULTI_GLAZE_PROPORTION"]) & ( + self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)) self.data.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100 - diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py index 517460b0..9ac2c13d 100644 --- a/model_data/simulation_system/app.py +++ b/model_data/simulation_system/app.py @@ -1,13 +1,13 @@ import numpy as np import pandas as pd from tqdm import tqdm -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from pathlib import Path from model_data.simulation_system.Settings import ( MANDATORY_FIXED_FEATURES, - AVERAGE_FIXED_FEATURES, - LATEST_FIELD, - COMPONENT_FEATURES, + AVERAGE_FIXED_FEATURES, + LATEST_FIELD, + COMPONENT_FEATURES, RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, COLUMNS_TO_MERGE_ON, @@ -18,6 +18,7 @@ from DataProcessor import DataProcessor DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates' + def app(): # Get all the files in the directory @@ -30,8 +31,9 @@ def app(): dataset = [] # 116 # 128048706 - # PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic-certificates/domestic-E09000021-Kingston-upon-Thames') - for directory in tqdm(directories): + # PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic + # -certificates/domestic-E09000021-Kingston-upon-Thames') + for directory in tqdm(directories): filepath = directory / "certificates.csv" @@ -45,7 +47,7 @@ 
def app(): # Fixed features - these are property attributes that shouldn't change over time fixed_data = {} - # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row + # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1: continue @@ -61,16 +63,21 @@ def app(): cleaned_columns_to_merge_on = na_columns.index[~na_columns].to_list() # Get the corresponding groupby and merge, and fill in NA values - cleaning_averages_to_merge = cleaning_averages.groupby(cleaned_columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean() - - modified_property_data = pd.merge(property_data, cleaning_averages_to_merge, on=cleaned_columns_to_merge_on, suffixes=['', '_AVERAGE']) - modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE']) - modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE']) - modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE']) + cleaning_averages_to_merge = cleaning_averages.groupby(cleaned_columns_to_merge_on)[ + ['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean() + + modified_property_data = pd.merge(property_data, cleaning_averages_to_merge, on=cleaned_columns_to_merge_on, + suffixes=['', '_AVERAGE']) + modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna( + modified_property_data['TOTAL_FLOOR_AREA_AVERAGE']) + modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna( + modified_property_data['FLOOR_HEIGHT_AVERAGE']) + modified_property_data = modified_property_data.drop( + columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE']) for field in AVERAGE_FIXED_FEATURES: - vals = 
list(modified_property_data[field].dropna().unique()) + vals = list(modified_property_data[field].dropna().unique()) if len(vals) > 1: # Check the values are too far apart # TODO: we could have multiple values here, why only use the first two? @@ -80,10 +87,10 @@ def app(): if len(vals) == 0: wrong_var - + fixed_data[field] = np.mean(vals) - #Combine all fields together + # Combine all fields together fixed_data.update(mandatory_field_data) fixed_data.update(latest_field_data) @@ -132,4 +139,4 @@ def app(): if __name__ == "__main__": - app() \ No newline at end of file + app() diff --git a/model_data/simulation_system/energy_predictor.py b/model_data/simulation_system/energy_predictor.py index 4a361196..87ad3799 100644 --- a/model_data/simulation_system/energy_predictor.py +++ b/model_data/simulation_system/energy_predictor.py @@ -1,15 +1,15 @@ from pathlib import Path from Settings import ( - RDSAP_RESPONSE, - FLOOR_LEVEL_MAP, + RDSAP_RESPONSE, + FLOOR_LEVEL_MAP, BUILT_FORM_REMAP, - EARLIEST_EPC_DATE, + EARLIEST_EPC_DATE, FULLY_GLAZED_DESCRIPTIONS, FIXED_FEATURES, LATEST_FIELD, COMPONENT_FEATURES - ) -from model_data.BaseUtility import BaseUtility +) +from model_data.BaseUtility import Definitions from tqdm import tqdm import pandas as pd import numpy as np @@ -21,17 +21,18 @@ RANDOM_SEED = 0 DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates' FLOAT_COLUMNS = [ - 'NUMBER_OPEN_FIREPLACES', - 'EXTENSION_COUNT', - 'TOTAL_FLOOR_AREA', - 'PHOTO_SUPPLY', - 'FIXED_LIGHTING_OUTLETS_COUNT', - 'FLOOR_HEIGHT', - 'NUMBER_HABITABLE_ROOMS', - 'LOW_ENERGY_LIGHTING', - 'MULTI_GLAZE_PROPORTION', - 'NUMBER_HEATED_ROOMS' - ] + 'NUMBER_OPEN_FIREPLACES', + 'EXTENSION_COUNT', + 'TOTAL_FLOOR_AREA', + 'PHOTO_SUPPLY', + 'FIXED_LIGHTING_OUTLETS_COUNT', + 'FLOOR_HEIGHT', + 'NUMBER_HABITABLE_ROOMS', + 'LOW_ENERGY_LIGHTING', + 'MULTI_GLAZE_PROPORTION', + 'NUMBER_HEATED_ROOMS' +] + def create_raw_data(): """ @@ -40,7 +41,7 @@ def create_raw_data(): directories = 
[entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] # directories = directories[0:10] - dfs = [] + dfs = [] for directory in tqdm(directories): filepath = directory / "certificates.csv" df = pd.read_csv(filepath, low_memory=False) @@ -52,7 +53,8 @@ def create_raw_data(): df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])] # Change multi glaze proportion - no_multi_glaze_proportion_index = pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)) + no_multi_glaze_proportion_index = pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & ( + df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)) df.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100 # Recast @@ -63,12 +65,12 @@ def create_raw_data(): df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) # Map all anomaly values to None - data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES))) - + data_anomaly_map = dict(zip(Definitions.DATA_ANOMALY_MATCHES, [None] * len(Definitions.DATA_ANOMALY_MATCHES))) + # Use replace function to map data (if exists in key), to corresponding value - i.e. 
Remove invalid values df = df.replace(data_anomaly_map) df = df.replace(np.NAN, None) - + # Remap certain columns df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP) df['BUILT_FROM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP) @@ -83,7 +85,6 @@ def create_raw_data(): df[RDSAP_RESPONSE] = pd.to_numeric(df[RDSAP_RESPONSE], downcast='unsigned') df[FLOAT_COLUMNS] = df[FLOAT_COLUMNS].apply(pd.to_numeric, downcast='float') - dfs.append(df) data = pd.concat(dfs) @@ -95,23 +96,23 @@ def create_raw_data(): def main(): - data = TabularDataset(data='./model_build_data/energy_data/cleaned_data/train_validation_data.parquet') - subsample_size = round(len(data)/100) + subsample_size = round(len(data) / 100) data = data.sample(subsample_size, random_state=RANDOM_SEED) predictor_RDSAP = TabularPredictor( - label=RDSAP_RESPONSE, - path="agModels-predictENERGY", + label=RDSAP_RESPONSE, + path="agModels-predictENERGY", problem_type="regression", eval_metric='mean_absolute_error' - ).fit(data, time_limit=800, presets='high_quality', excluded_model_types=['KNN', 'CAT']) + ).fit(data, time_limit=800, presets='high_quality', excluded_model_types=['KNN', 'CAT']) test_data = TabularDataset('./model_build_data/energy_data/cleaned_data/test_data.parquet') performance = predictor_RDSAP.evaluate(test_data) predictions = predictor_RDSAP.predict(test_data) predictor_RDSAP.feature_importance(test_data) + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/model_data/tests/test_floor_attributes.py b/model_data/tests/test_floor_attributes.py index ce17aa3b..4b3b2c85 100644 --- a/model_data/tests/test_floor_attributes.py +++ b/model_data/tests/test_floor_attributes.py @@ -36,7 +36,7 @@ class TestCleanFloor: # Test that invalid descriptions raise a ValueError invalid_descriptions = [ "invalid description", - "description with no known floor types or thermal transmittance", + "description with no known floor data_types or thermal transmittance", ] for 
description in invalid_descriptions: diff --git a/model_data/tests/test_hotwater_attributes.py b/model_data/tests/test_hotwater_attributes.py index 040cc1ba..25cd8f40 100644 --- a/model_data/tests/test_hotwater_attributes.py +++ b/model_data/tests/test_hotwater_attributes.py @@ -29,7 +29,7 @@ class TestHotWaterAttributes: # Test that invalid descriptions raise a ValueError invalid_descriptions = [ "invalid description", - "description with no known hotwater types", + "description with no known hotwater data_types", "" ] diff --git a/model_data/tests/test_mainfuel_attributes.py b/model_data/tests/test_mainfuel_attributes.py index b67bf203..cf23cb9f 100644 --- a/model_data/tests/test_mainfuel_attributes.py +++ b/model_data/tests/test_mainfuel_attributes.py @@ -29,7 +29,7 @@ class TestMainHeatControlAttributes: # Test that invalid descriptions raise a ValueError invalid_descriptions = [ "invalid description", - "description with no known fuel types", + "description with no known fuel data_types", ] for description in invalid_descriptions: diff --git a/model_data/tests/test_mainheat_attributes.py b/model_data/tests/test_mainheat_attributes.py index 761618eb..a092945d 100644 --- a/model_data/tests/test_mainheat_attributes.py +++ b/model_data/tests/test_mainheat_attributes.py @@ -34,7 +34,7 @@ class TestMainHeatAttributes: invalid_descriptions = [ "", "invalid description", - "description with no known heating types", + "description with no known heating data_types", ] for description in invalid_descriptions: diff --git a/model_data/tests/test_mainheat_controls_attributes.py b/model_data/tests/test_mainheat_controls_attributes.py index 2eaa1822..afdde784 100644 --- a/model_data/tests/test_mainheat_controls_attributes.py +++ b/model_data/tests/test_mainheat_controls_attributes.py @@ -29,7 +29,7 @@ class TestMainHeatControlAttributes: # Test that invalid descriptions raise a ValueError invalid_descriptions = [ "invalid description", - "description with no known heating 
control types", + "description with no known heating control data_types", ] for description in invalid_descriptions: diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py index 681d267d..3d53da69 100644 --- a/recommendations/FloorRecommendations.py +++ b/recommendations/FloorRecommendations.py @@ -1,6 +1,7 @@ import math from typing import List -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions +from datatypes.enums import QuantityUnits from backend.Property import Property from recommendations.rdsap_tables import default_wall_thickness, age_band_data from recommendations.recommendation_utils import ( @@ -13,7 +14,7 @@ suspended_floor_insulation_parts = [ # Example product # https://www.insulationsuperstore.co.uk/product/recticel-eurothane-general-purpose-pir-insulation-board-2400 # -x-1200-x-100mm.html - # All product types here: + # All product data_types here: # https://www.insulationsuperstore.co.uk/browse/insulation/brand/recticel/filterby/application/floors.html "type": "suspended_floor_insulation", "description": "Rigid Insulation Foam Boards", @@ -29,7 +30,7 @@ suspended_floor_insulation_parts = [ { # Example product # https://www.insulationsuperstore.co.uk/product/rockwool-rwa45-acoustic-insulation-slab-100mm-2-88m2-pack.html - # All product types here: + # All product data_types here: # https://www.insulationsuperstore.co.uk/browse/insulation/brand/rockwool/filterby/application/floors # /material/mineral-wool.html "type": "suspended_floor_insulation", @@ -49,7 +50,7 @@ solid_floor_insulation_parts = [ { # Example product # https://www.insulationexpress.co.uk/floor-insulation/solid-floor-insulation/k103-100mm - # All product types here: + # All product data_types here: # https://www.insulationexpress.co.uk/floor-insulation/solid-floor-insulation?brand=7015&p=1 # Example screed https://www.screwfix.com/p/mapei-ultraplan-3240-self-levelling-compound-25kg/4959f "type": 
"solid_floor_insulation", @@ -69,7 +70,7 @@ solid_floor_insulation_parts = [ parts = suspended_floor_insulation_parts + solid_floor_insulation_parts -class FloorRecommendations(BaseUtility): +class FloorRecommendations(Definitions): # part L building regulations indicate that any rennovations on an existing property's walls should # achieve a U-value of no higher than 0.3 BUILDING_REGULATIONS_PART_L_MAX_U_VALUE = 0.25 @@ -305,17 +306,25 @@ class FloorRecommendations(BaseUtility): if new_u_value <= self.BUILDING_REGULATIONS_PART_L_MAX_U_VALUE: lowest_selected_u_value = update_lowest_selected_u_value(lowest_selected_u_value, new_u_value) + estimated_cost = cost_per_unit * self.property.floor_area + self.recommendations.append( { "parts": [ - get_recommended_part(part, depth, cost_per_unit), + get_recommended_part( + part=part, + selected_depth=depth, + quantity=self.property.floor_area, + quantity_unit=QuantityUnits.m2.value, + selected_total_cost=estimated_cost + ), ], "type": "floor_insulation", "description": self._make_floor_description(part, depth), "starting_u_value": u_value, "new_u_value": new_u_value, "sap_points": estimate_sap_points(), - "cost": cost_per_unit * self.property.floor_area, + "cost": estimated_cost, } ) diff --git a/recommendations/WallRecommendations.py b/recommendations/WallRecommendations.py index fc8f3c7b..fdd271be 100644 --- a/recommendations/WallRecommendations.py +++ b/recommendations/WallRecommendations.py @@ -1,8 +1,9 @@ import itertools import math +from datatypes.enums import QuantityUnits from backend.Property import Property -from model_data.BaseUtility import BaseUtility +from model_data.BaseUtility import Definitions from recommendations.recommendation_utils import ( r_value_per_mm_to_u_value, calculate_u_value_uplift, is_diminishing_returns, update_lowest_selected_u_value, get_recommended_part, get_uvalue_estimate, estimate_sap_points @@ -184,7 +185,7 @@ internal_wall_insulation_parts = [ wall_parts = 
external_wall_insulation_parts + internal_wall_insulation_parts -class WallRecommendations(BaseUtility): +class WallRecommendations(Definitions): YEAR_WALLS_BUILT_WITH_INSULATION = 1990 # After 1930, Solid brick walls became less populate and instead, cavity walls became a # more popular choice @@ -332,15 +333,25 @@ class WallRecommendations(BaseUtility): if new_u_value <= self.BUILDING_REGULATIONS_PART_L_MAX_U_VALUE: lowest_selected_u_value = update_lowest_selected_u_value(lowest_selected_u_value, new_u_value) + estimated_cost = cost_per_unit * self.property.insulation_wall_area + recommendations.append( { - "parts": [get_recommended_part(part, depth, cost_per_unit)], + "parts": [ + get_recommended_part( + part=part, + selected_depth=depth, + quantity=self.property.insulation_wall_area, + quantity_unit=QuantityUnits.m2.value, + selected_total_cost=estimated_cost + ) + ], "type": "wall_insulation", "description": "Install " + self._make_description(part, depth), "starting_u_value": u_value, "new_u_value": new_u_value, "sap_points": estimate_sap_points(), - "cost": cost_per_unit * self.property.insulation_wall_area, + "cost": estimated_cost, } ) @@ -394,10 +405,25 @@ class WallRecommendations(BaseUtility): if combined_new_u_value - self.U_VALUE_ERROR <= self.BUILDING_REGULATIONS_PART_L_MAX_U_VALUE: # Here you might want to define a way to add both recommendations together. 
# For now, I'm adding them as separate items in the list + ewi_esimtated_cost = ewi_cost_per_unit * self.property.insulation_wall_area + iwi_esimtated_cost = iwi_cost_per_unit * self.property.insulation_wall_area + recommendation = { "parts": [ - get_recommended_part(ewi_part, ewi_depth, ewi_cost_per_unit), - get_recommended_part(iwi_part, iwi_depth, iwi_cost_per_unit) + get_recommended_part( + part=ewi_part, + selected_depth=ewi_depth, + quantity=self.property.insulation_wall_area, + quantity_unit=QuantityUnits.m2.value, + selected_total_cost=ewi_esimtated_cost + ), + get_recommended_part( + part=iwi_part, + selected_depth=iwi_depth, + quantity=self.property.insulation_wall_area, + quantity_unit=QuantityUnits.m2.value, + selected_total_cost=iwi_esimtated_cost + ) ], "type": "wall_insulation", "description": ( @@ -407,10 +433,7 @@ class WallRecommendations(BaseUtility): "starting_u_value": u_value, "new_u_value": combined_new_u_value, "sap_points": estimate_sap_points(), - "cost": ( - ewi_cost_per_unit * self.property.insulation_wall_area + iwi_cost_per_unit * - self.property.insulation_wall_area - ), + "cost": ewi_esimtated_cost + iwi_esimtated_cost, } self.recommendations.append(recommendation) diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py index 5bbcbe9f..9b7dbd4e 100644 --- a/recommendations/recommendation_utils.py +++ b/recommendations/recommendation_utils.py @@ -110,17 +110,21 @@ def update_lowest_selected_u_value(lowest_selected_u_value, new_u_value): return lowest_selected_u_value -def get_recommended_part(part, selected_depth, selected_cost): +def get_recommended_part(part, selected_depth, selected_total_cost, quantity, quantity_unit): """ Utility function to return a recommended part with the selected depth. 
:param part: part to be recommended :param selected_depth: depth of the selected part - :param selected_cost: cost of the selected depth + :param selected_total_cost: Total cost of the selected part + :param quantity: Quantity of the selected part + :param quantity_unit: Unit of the quantity :return: """ recommended_part = deepcopy(part) recommended_part["depths"] = [selected_depth] - recommended_part["cost"] = [selected_cost] + recommended_part["estimated_cost"] = selected_total_cost + recommended_part["quantity"] = quantity + recommended_part["quantity_unit"] = quantity_unit return recommended_part diff --git a/serverless.yml b/serverless.yml index c1b25d76..3a01acb0 100644 --- a/serverless.yml +++ b/serverless.yml @@ -46,6 +46,7 @@ package: - 'model_data/EpcClean.py' - 'model_data/utils.py' - 'model_data/epc_attributes/**' + - 'datatypes/**' - '!infrastructure/**' - '!data_collection/**' - '!node_modules/**'