diff --git a/.idea/Model.iml b/.idea/Model.iml index 0b8ab409..1e51ede4 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index fb10c6b0..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/engine/engine.py b/backend/engine/engine.py index 98db7b88..76a906a9 100644 --- a/backend/engine/engine.py +++ b/backend/engine/engine.py @@ -864,6 +864,19 @@ async def model_engine(body: PlanTriggerRequest): "epc_page_rrn": epc_page_source.get("rrn"), }) + # TODO - delete me (finding the entries in epc data that are not in epc record + example_property = input_properties[0] + example_epc_record = example_property.epc_record + example_epc_data = example_property.data + + epc_record_dir = dir(example_epc_record) + + missed_keys = [] + for k in example_epc_data.keys(): + k_replaced = k.replace("-", "_") + if k_replaced not in epc_record_dir: + missed_keys.append(k_replaced) + if not input_properties: return Response(status_code=204) diff --git a/etl/epc/Record.py b/etl/epc/Record.py index e1853361..73f09bb9 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -1,3 +1,5 @@ +from typing import Optional, get_origin, get_args, Union +from dataclasses import fields from datetime import datetime from dataclasses import dataclass from etl.epc.ValidationConfiguration import ( @@ -24,8 +26,11 @@ from etl.epc.settings import ( ) from recommendations.recommendation_utils import estimate_number_of_floors from utils.s3 import read_dataframe_from_s3_parquet +from utils.logger import setup_logger from etl.epc.settings import EARLIEST_EPC_DATE +logger = setup_logger() + # TODO: Change these in the settings file RDSAP_RESPONSE = RDSAP_RESPONSE.lower() HEAT_DEMAND_RESPONSE = HEAT_DEMAND_RESPONSE.lower() @@ -47,60 +52,179 @@ class EPCRecord: Base class for a EPC record """ - uprn: int = None - walls_description: str = None - floor_description: str = None - lighting_description: str = 
None - roof_description: str = None - mainheat_description: str = None - hotwater_description: str = None - main_fuel: str = None - mechanical_ventilation: str = None - secondheat_description: str = None - windows_description: str = None - glazed_type: str = None - multi_glaze_proportion: float = None - low_energy_lighting: float = None - number_open_fireplaces: float = None - mainheatcont_description: str = None - solar_water_heating_flag: str = None - photo_supply: float = None - transaction_type: str = None - energy_tariff: str = None - extension_count: float = None - total_floor_area: float = None - floor_height: float = None - hot_water_energy_eff: str = None - floor_energy_eff: str = None - windows_energy_eff: str = None - walls_energy_eff: str = None - sheating_energy_eff: str = None - roof_energy_eff: str = None - mainheat_energy_eff: str = None - mainheatc_energy_eff: str = None - lighting_energy_eff: str = None - lighting_cost_current: float = None - heating_cost_current: float = None - hot_water_cost_current: float = None - potential_energy_efficiency: float = None - environment_impact_potential: float = None - energy_consumption_potential: float = None - co2_emissions_potential: float = None - lodgement_date: str = None - current_energy_efficiency: int = None - energy_consumption_current: int = None - co2_emissions_current: float = None - number_habitable_rooms: float = None - number_heated_rooms: float = None - is_post_sap10: bool = None + # ------------------------------------------------------------------ + # IDENTIFIERS / METADATA + # ------------------------------------------------------------------ - # u_values_walls = None - # u_values_roof = None - # u_values_floor = None + uprn: Optional[int] = None + lmk_key: Optional[str] = None + building_reference_number: Optional[str] = None + report_type: Optional[str] = None + transaction_type: Optional[str] = None + uprn_source: Optional[str] = None + + lodgement_date: Optional[str] = None + 
lodgement_datetime: Optional[str] = None + inspection_date: Optional[str] = None + + # ------------------------------------------------------------------ + # ADDRESS / LOCATION DATA + # ------------------------------------------------------------------ + + address: Optional[str] = None + address1: Optional[str] = None + address2: Optional[str] = None + address3: Optional[str] = None + + postcode: Optional[str] = None + posttown: Optional[str] = None + county: Optional[str] = None + + local_authority: Optional[str] = None + local_authority_label: Optional[str] = None + constituency: Optional[str] = None + constituency_label: Optional[str] = None + + # ------------------------------------------------------------------ + # PROPERTY CHARACTERISTICS + # ------------------------------------------------------------------ + + property_type: Optional[str] = None + built_form: Optional[str] = None + tenure: Optional[str] = None + floor_level: Optional[str] = None + flat_top_storey: Optional[str] = None + flat_storey_count: Optional[int] = None + + glazed_area: Optional[str] = None + heat_loss_corridor: Optional[str] = None + unheated_corridor_length: Optional[float] = None + + mains_gas_flag: Optional[str] = None + + # ------------------------------------------------------------------ + # BUILDING FABRIC DESCRIPTIONS + # ------------------------------------------------------------------ + + walls_description: Optional[str] = None + floor_description: Optional[str] = None + roof_description: Optional[str] = None + windows_description: Optional[str] = None + + walls_env_eff: Optional[str] = None + floor_env_eff: Optional[str] = None + roof_env_eff: Optional[str] = None + windows_env_eff: Optional[str] = None + mainheat_env_eff: Optional[str] = None + sheating_env_eff: Optional[str] = None + hot_water_env_eff: Optional[str] = None + mainheatc_env_eff: Optional[str] = None + + walls_energy_eff: Optional[str] = None + floor_energy_eff: Optional[str] = None + roof_energy_eff: 
Optional[str] = None + windows_energy_eff: Optional[str] = None + hot_water_energy_eff: Optional[str] = None + sheating_energy_eff: Optional[str] = None + mainheat_energy_eff: Optional[str] = None + mainheatc_energy_eff: Optional[str] = None + + # ------------------------------------------------------------------ + # HEATING / HOT WATER / SYSTEMS + # ------------------------------------------------------------------ + + mainheat_description: Optional[str] = None + mainheatcont_description: Optional[str] = None + secondheat_description: Optional[str] = None + hotwater_description: Optional[str] = None + main_fuel: Optional[str] = None + main_heating_controls: Optional[str] = None + + mechanical_ventilation: Optional[str] = None + + solar_water_heating_flag: Optional[str] = None + wind_turbine_count: Optional[int] = None + photo_supply: Optional[float] = None + + # ------------------------------------------------------------------ + # LIGHTING + # ------------------------------------------------------------------ + + lighting_description: Optional[str] = None + lighting_env_eff: Optional[str] = None + lighting_energy_eff: Optional[str] = None + + low_energy_lighting: Optional[float] = None + fixed_lighting_outlets_count: Optional[int] = None + low_energy_fixed_light_count: Optional[int] = None + + # ------------------------------------------------------------------ + # ENERGY RATINGS + # ------------------------------------------------------------------ + + current_energy_rating: Optional[str] = None + potential_energy_rating: Optional[str] = None + + current_energy_efficiency: Optional[int] = None + potential_energy_efficiency: Optional[float] = None + + # ------------------------------------------------------------------ + # ENERGY / CARBON METRICS + # ------------------------------------------------------------------ + + energy_consumption_current: Optional[int] = None + energy_consumption_potential: Optional[float] = None + + co2_emissions_current: 
Optional[float] = None + co2_emissions_potential: Optional[float] = None + + co2_emiss_curr_per_floor_area: Optional[float] = None + + environment_impact_current: Optional[int] = None + environment_impact_potential: Optional[float] = None + + # ------------------------------------------------------------------ + # COST METRICS + # ------------------------------------------------------------------ + + heating_cost_current: Optional[float] = None + lighting_cost_current: Optional[float] = None + hot_water_cost_current: Optional[float] = None + + heating_cost_potential: Optional[float] = None + lighting_cost_potential: Optional[float] = None + hot_water_cost_potential: Optional[float] = None + + energy_tariff: Optional[str] = None + + # ------------------------------------------------------------------ + # PROPERTY DIMENSIONS / COUNTS + # ------------------------------------------------------------------ + + total_floor_area: Optional[float] = None + floor_height: Optional[float] = None + + number_habitable_rooms: Optional[float] = None + number_heated_rooms: Optional[float] = None + number_open_fireplaces: Optional[float] = None + + extension_count: Optional[float] = None + + # ------------------------------------------------------------------ + # GLAZING + # ------------------------------------------------------------------ + + glazed_type: Optional[str] = None + multi_glaze_proportion: Optional[float] = None + + # ------------------------------------------------------------------ + # MODEL FLAGS + # ------------------------------------------------------------------ + + is_post_sap10: Optional[bool] = None run_mode: str = "training" - # TODO: Make this a class so thet api_records is structured epc_records: dict = None full_sap_epc: dict = None old_data: list[dict] = None @@ -146,20 +270,8 @@ class EPCRecord: self._expand_prepared_epc_to_attributes() self._identify_delta_between_prepared_and_original_records() - # Process to create uvalues for the single epc record - 
# self.df = self.epc_record_as_dataframe('prepared_epc') - # self._feature_generation() - # self._drop_features() - return - # self._expand_description_to_features() - # self._expand_description_to_uvalues() - # - # self._generate_uvalues() - # self._validate_expanded_description() - # self._validate_u_values() - def _drop_features(self): """ Drop features that are not needed for modelling @@ -200,88 +312,59 @@ class EPCRecord: self.prepared_epc = epc_data_processor.data.to_dict(orient="records")[0] + def _cast_value(self, value, type_hint): + + origin = get_origin(type_hint) + args = get_args(type_hint) + + if origin is Union: + type_hint = [a for a in args if a is not type(None)][0] + + if type_hint is int: + return int(value) + + if type_hint is float: + return float(value) + + if type_hint is bool: + if isinstance(value, bool): + return value + return str(value).lower() in ["true", "1", "y", "yes"] + + if type_hint is str: + return str(value) + + return value + def _expand_prepared_epc_to_attributes(self): """ - This method will expand the prepared epc to attributes + Expand prepared_epc dictionary into dataclass attributes. + Assumes prepared_epc keys are snake_case. 
def _expand_prepared_epc_to_attributes(self):
    """
    Expand the prepared_epc dictionary into dataclass attributes.

    Every key of ``self.prepared_epc`` that names a dataclass field is cast
    to that field's annotated type via ``self._cast_value`` and stored on the
    instance. Keys that are not dataclass fields are ignored. Missing values
    (``None``, the empty string and float NaN) are stored as ``None``.

    If a value cannot be cast, the failure is logged and the raw value is
    stored unchanged (best effort), so in that case the attribute may not
    match its annotation.

    Raises:
        ValueError: if any key contains a hyphen (keys must be snake_case).
            All keys are checked before any attribute is assigned, so the
            instance is never left partially populated.
    """
    prepared = self.prepared_epc

    # Validate the whole schema up front; report the first offending key.
    hyphenated = [key for key in prepared if "-" in key]
    if hyphenated:
        raise ValueError(
            f"Invalid EPC key format (expected snake_case): {hyphenated[0]}"
        )

    field_types = {f.name: f.type for f in fields(self)}

    for key, value in prepared.items():
        if key not in field_types:
            # Ignore keys that are not part of the dataclass schema
            continue

        # prepared_epc is built from a DataFrame row, so a missing value can
        # show up as float NaN (NaN is the only float unequal to itself).
        # Explicit type checks avoid `==` against array-like / NA objects.
        is_nan = isinstance(value, float) and value != value
        is_empty_str = isinstance(value, str) and value == ""
        if value is None or is_nan or is_empty_str:
            setattr(self, key, None)
            continue

        try:
            cast_value = self._cast_value(value, field_types[key])
        except (TypeError, ValueError, OverflowError) as e:
            # Best effort: keep the raw value rather than failing the record.
            logger.error(
                "Failed casting field '%s' with value '%s': %s", key, value, e
            )
            cast_value = value

        setattr(self, key, cast_value)
def test_casting(self, cleaning_data):
    """
    Check the Python type of each EPCRecord attribute after construction.

    Builds an EPCRecord in "newdata" run mode from one hyphen-keyed
    original EPC record plus one older record, then asserts that every
    attribute listed in ``expected_types`` is an instance of the expected
    type. An expected type of ``None`` means the attribute itself must be
    ``None``.
    """
    # Tests expected type casting, against previously hard-coded expectations to ensure that the
    # expected types are correct and that we don't accidentally change them in future

    test_epc_records = {
        'original_epc': {
            'uprn': '100023417525', 'county': 'Greater London Authority', 'tenure': 'rental (social)',
            'address': '31 Mimosa House, Larch Crescent',
            'lmk-key': '201660309922019061719223615438661', 'address1': '31 Mimosa House',
            'address2': 'Larch Crescent', 'address3': '', 'postcode': 'UB4 9DH', 'posttown': 'HAYES',
            'main-fuel': 'mains gas (not community)', 'built-form': 'Mid-Terrace', 'floor-level': 2,
            'glazed-area': 'Normal', 'glazed-type': 'double glazing, unknown install date',
            'report-type': '100', 'uprn-source': 'Address Matched', 'constituency': 'E14000737',
            'floor-height': 2.39, 'photo-supply': None, 'roof-env-eff': 'Average',
            'energy-tariff': 'Single', 'floor-env-eff': 'N/A', 'property-type': 'Maisonette',
            'walls-env-eff': 'Average', 'lodgement-date': '2019-06-17', 'mains-gas-flag': True,
            'extension-count': 0, 'flat-top-storey': 'Y', 'inspection-date': '2019-06-17',
            'local-authority': 'E09000017', 'roof-energy-eff': 'Average', 'windows-env-eff': 'Average',
            'floor-energy-eff': 'NO DATA!', 'lighting-env-eff': 'Good', 'mainheat-env-eff': 'Good',
            'roof-description': 'Pitched, 100 mm loft insulation', 'sheating-env-eff': 'N/A',
            'total-floor-area': 67.0, 'transaction-type': 'rental (social)',
            'walls-energy-eff': 'Average', 'flat-storey-count': None,
            'floor-description': '(another dwelling below)', 'hot-water-env-eff': 'Good',
            'mainheatc-env-eff': 'Average', 'walls-description': 'Cavity wall, filled cavity',
            'constituency-label': 'Hayes and Harlington', 'heat-loss-corridor': 'no corridor',
            'lodgement-datetime': '2019-06-17 19:22:36', 'wind-turbine-count': 0,
            'windows-energy-eff': 'Average', 'lighting-energy-eff': 'Good',
            'low-energy-lighting': '67', 'mainheat-energy-eff': 'Good', 'number-heated-rooms': 3.0,
            'sheating-energy-eff': 'N/A', 'windows-description': 'Fully double glazed',
            'heating-cost-current': '310', 'hot-water-energy-eff': 'Good',
            'hotwater-description': 'From main system',
            'lighting-description': 'Low energy lighting in 67% of fixed outlets',
            'mainheat-description': 'Boiler and radiators, mains gas',
            'mainheatc-energy-eff': 'Average', 'co2-emissions-current': 2.1,
            'construction-age-band': 'England and Wales: 1950-1966', 'current-energy-rating': 'C',
            'lighting-cost-current': '70', 'local-authority-label': 'Hillingdon',
            'main-heating-controls': '2104', 'heating-cost-potential': '265',
            'hot-water-cost-current': '136', 'mechanical-ventilation': 'natural',
            'multi-glaze-proportion': '100', 'number-habitable-rooms': 3.0,
            'number-open-fireplaces': 0, 'secondheat-description': 'None',
            'co2-emissions-potential': 1.7, 'lighting-cost-potential': '53',
            'potential-energy-rating': 'C', 'hot-water-cost-potential': '106',
            'mainheatcont-description': 'Programmer and room thermostat',
            'solar-water-heating-flag': 'N', 'unheated-corridor-length': None,
            'building-reference-number': '6110075568', 'current-energy-efficiency': 73,
            'energy-consumption-current': 180.0, 'environment-impact-current': '72',
            'potential-energy-efficiency': 77, 'energy-consumption-potential': '141',
            'environment-impact-potential': '78', 'fixed-lighting-outlets-count': 9,
            'low-energy-fixed-light-count': '', 'co2-emiss-curr-per-floor-area': '32'
        },
        'full_sap_epc': {},
        'old_data': [
            {'uprn': '100023417525', 'county': 'Greater London Authority', 'tenure': 'rental (social)',
             'address': '31 Mimosa House, Larch Crescent', 'lmk-key': '201660300922008121514105815828768',
             'address1': '31 Mimosa House', 'address2': 'Larch Crescent', 'address3': '', 'postcode': 'UB4 9DH',
             'posttown': 'HAYES',
             'main-fuel': 'mains gas - this is for backwards compatibility only and should not be used',
             'built-form': 'Mid-Terrace', 'floor-level': '2nd', 'glazed-area': 'Normal',
             'glazed-type': 'double glazing, unknown install date', 'report-type': '100',
             'uprn-source': 'Address Matched', 'constituency': 'E14000737', 'floor-height': '2.36',
             'photo-supply': '0.0', 'roof-env-eff': 'Good', 'energy-tariff': 'Single', 'floor-env-eff': 'N/A',
             'property-type': 'Flat', 'walls-env-eff': 'Poor', 'lodgement-date': '2008-12-15',
             'mains-gas-flag': 'Y', 'extension-count': '0', 'flat-top-storey': 'Y', 'inspection-date': '2008-12-12',
             'local-authority': 'E09000017', 'roof-energy-eff': 'Good', 'windows-env-eff': 'Average',
             'floor-energy-eff': 'N/A', 'lighting-env-eff': 'Good', 'mainheat-env-eff': 'Good',
             'roof-description': 'Pitched, 150 mm loft insulation', 'sheating-env-eff': 'N/A',
             'total-floor-area': '69.8', 'transaction-type': 'rental (social)', 'walls-energy-eff': 'Poor',
             'flat-storey-count': '4.0', 'floor-description': '(other premises below)', 'hot-water-env-eff': 'Good',
             'mainheatc-env-eff': 'Poor', 'walls-description': 'Cavity wall, as built, no insulation (assumed)',
             'constituency-label': 'Hayes and Harlington', 'heat-loss-corridor': 'no corridor',
             'lodgement-datetime': '2008-12-15 14:10:58', 'wind-turbine-count': '0',
             'windows-energy-eff': 'Average', 'lighting-energy-eff': 'Good', 'low-energy-lighting': '56',
             'mainheat-energy-eff': 'Good', 'number-heated-rooms': '3', 'sheating-energy-eff': 'N/A',
             'windows-description': 'Fully double glazed', 'heating-cost-current': '315',
             'hot-water-energy-eff': 'Good', 'hotwater-description': 'From main system',
             'lighting-description': 'Low energy lighting in 56% of fixed outlets',
             'mainheat-description': 'Boiler and radiators, mains gas', 'mainheatc-energy-eff': 'Poor',
             'co2-emissions-current': '2.8', 'construction-age-band': 'England and Wales: 1967-1975',
             'current-energy-rating': 'C', 'lighting-cost-current': '46', 'local-authority-label': 'Hillingdon',
             'main-heating-controls': '2104', 'heating-cost-potential': '207', 'hot-water-cost-current': '119',
             'mechanical-ventilation': 'natural', 'multi-glaze-proportion': '100', 'number-habitable-rooms': '3',
             'number-open-fireplaces': '0', 'secondheat-description': 'None', 'co2-emissions-potential': '1.7',
             'lighting-cost-potential': '32', 'potential-energy-rating': 'B', 'hot-water-cost-potential': '96',
             'mainheatcont-description': 'Programmer and room thermostat', 'solar-water-heating-flag': 'N',
             'unheated-corridor-length': '', 'building-reference-number': '6110075568',
             'current-energy-efficiency': '71', 'energy-consumption-current': '239',
             'environment-impact-current': '67', 'potential-energy-efficiency': '82',
             'energy-consumption-potential': '148', 'environment-impact-potential': '80',
             'fixed-lighting-outlets-count': '', 'low-energy-fixed-light-count': '',
             'co2-emiss-curr-per-floor-area': '40'}
        ]
    }

    record = EPCRecord(
        epc_records=test_epc_records,
        run_mode="newdata",
        cleaning_data=cleaning_data
    )

    # Attribute name -> expected Python type; None means the attribute
    # itself is expected to be None.
    expected_types = {
        "uprn": int,
        "walls_description": str,
        "floor_description": str,
        "lighting_description": str,
        "roof_description": str,
        "mainheat_description": str,
        "hotwater_description": str,
        "main_fuel": str,
        "mechanical_ventilation": str,
        "secondheat_description": str,
        "windows_description": str,
        "glazed_type": str,
        "multi_glaze_proportion": float,
        "low_energy_lighting": float,
        "number_open_fireplaces": float,
        "mainheatcont_description": str,
        "solar_water_heating_flag": str,
        "photo_supply": float,
        "transaction_type": str,
        "energy_tariff": str,
        "extension_count": float,
        "total_floor_area": float,
        "floor_height": float,
        "hot_water_energy_eff": str,
        "floor_energy_eff": None,  # The input is NO DATA so we map to None
        "windows_energy_eff": str,
        "walls_energy_eff": str,
        # Input here is 'N/A'; presumably also mapped to None upstream — TODO confirm
        "sheating_energy_eff": None,
        "roof_energy_eff": str,
        "mainheat_energy_eff": str,
        "mainheatc_energy_eff": str,
        "lighting_energy_eff": str,
        "lighting_cost_current": float,
        "heating_cost_current": float,
        "hot_water_cost_current": float,
        "potential_energy_efficiency": float,
        "environment_impact_potential": float,
        "energy_consumption_potential": float,
        "co2_emissions_potential": float,
        "lodgement_date": str,
        "current_energy_efficiency": int,
        "energy_consumption_current": int,
        "co2_emissions_current": float,
        "number_habitable_rooms": float,
        "number_heated_rooms": float,
        "is_post_sap10": bool,
    }

    for field, expected_type in expected_types.items():
        value = getattr(record, field)

        # A None expectation asserts on the value, not on its type.
        if expected_type is None:
            assert value is None, f"{field} expected to be None, got {value}"
            continue

        assert isinstance(
            value, expected_type
        ), f"{field} expected {expected_type}, got {type(value)}"