From 6e1607bbba6b014a62ff7ea98345ae70f8f630c7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Sep 2023 15:06:13 +0300 Subject: [PATCH] debugging cleaning class --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/app/plan/router.py | 1 + model_data/EpcClean.py | 4 +++- model_data/epc_attributes/FloorAttributes.py | 1 + model_data/epc_attributes/HotWaterAttributes.py | 6 +++++- model_data/epc_attributes/LightingAttributes.py | 9 +++++++++ model_data/epc_attributes/MainFuelAttributes.py | 2 ++ model_data/epc_attributes/MainheatAttributes.py | 7 +++++-- model_data/epc_attributes/RoofAttributes.py | 1 + model_data/epc_attributes/attribute_utils.py | 14 ++++++++++++++ .../test_data/test_lighting_attributes_cases.py | 3 ++- .../test_data/test_main_fuel_attributes_cases.py | 7 ++++++- sapmodel.serverless.yml | 1 + 14 files changed, 52 insertions(+), 8 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 05b9012b..b03b31b1 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 3b05c6ac..ca0e1cd9 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 4b972a6a..8f1413ee 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -373,6 +373,7 @@ async def trigger_plan(body: PlanTriggerRequest): recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data) + # TODO: Set the TRANSACTION_TYPE # Clean the data cleaning_data = read_parquet_from_s3( bucket_name="retrofit-data-dev", diff --git a/model_data/EpcClean.py b/model_data/EpcClean.py index c8594de8..adec9978 100644 --- a/model_data/EpcClean.py +++ b/model_data/EpcClean.py @@ -2,6 +2,8 @@ from typing import List, Dict, Any from collections import Counter from collections import defaultdict +import pandas as pd + from model_data.utils import correct_spelling from 
model_data.epc_attributes.FloorAttributes import FloorAttributes from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes @@ -97,7 +99,7 @@ class EpcClean: self._init_empty_cleaned_obj() for field in self.CLEANING_FIELDS: - self.unique_vals[field] = Counter([v[field] for v in self.data]) + self.unique_vals[field] = Counter([v[field] for v in self.data if not pd.isnull(v[field])]) self.clean_wrapper(field="floor-description", cleaning_cls=FloorAttributes) self.clean_wrapper(field="hotwater-description", cleaning_cls=HotWaterAttributes) diff --git a/model_data/epc_attributes/FloorAttributes.py b/model_data/epc_attributes/FloorAttributes.py index 024ec6dc..71a8b5a8 100644 --- a/model_data/epc_attributes/FloorAttributes.py +++ b/model_data/epc_attributes/FloorAttributes.py @@ -14,6 +14,7 @@ class FloorAttributes(Definitions): WELSH_TEXT = { "(anheddiad arall islaw)": "(another dwelling below)", + "solet, dim inswleiddio (rhagdybiaeth)": "solid, no insulation (assumed)" } def __init__(self, description: str): diff --git a/model_data/epc_attributes/HotWaterAttributes.py b/model_data/epc_attributes/HotWaterAttributes.py index 97664416..2535032b 100644 --- a/model_data/epc_attributes/HotWaterAttributes.py +++ b/model_data/epc_attributes/HotWaterAttributes.py @@ -15,7 +15,8 @@ class HotWaterAttributes(Definitions): 'oil boiler', # A boiler that uses oil as fuel to heat water 'electric instantaneous', # Similar to gas instantaneous, but uses electricity as its energy source 'gas multipoint', # A gas water heater that can supply hot water to multiple points of use at once - 'heat pump' # A general category for heat pumps, regardless of the energy source + 'heat pump', # A general category for heat pumps, regardless of the energy source + 'solid fuel boiler' # burns solid materials to generate heat for water heating and/or space heating ] # SYSTEM_TYPES refer to the larger system within which the heater operates. 
@@ -83,6 +84,7 @@ class HotWaterAttributes(Definitions): # not common, especially in modern homes. APPLIANCE_SYSTEMS = [ 'gas range cooker', # A gas-powered range cooker + 'oil range cooker' ] # Descriptions which represent the same thing @@ -92,6 +94,7 @@ class HotWaterAttributes(Definitions): WELSH_TEXT = { "ogçör brif system": "from main system", + "ogçör brif system, adfer gwres nwyon ffliw": "from main system, flue gas heat recovery" } def __init__(self, description: str): @@ -118,6 +121,7 @@ class HotWaterAttributes(Definitions): self.CHP_SYSTEMS, self.NO_SYSTEM_PRESENT_KEYWORDS, self.APPLIANCE_SYSTEMS, + self.DISTRIBUTION_SYSTEM_KEYWORDS ] ): raise ValueError('Invalid description') diff --git a/model_data/epc_attributes/LightingAttributes.py b/model_data/epc_attributes/LightingAttributes.py index 92c03846..452caa7a 100644 --- a/model_data/epc_attributes/LightingAttributes.py +++ b/model_data/epc_attributes/LightingAttributes.py @@ -4,9 +4,18 @@ from model_data.utils import correct_spelling class LightingAttributes: + WELSH_TEXT = { + "goleuadau ynni-isel ym mhob un ogçör mannau gosod": "low energy lighting in all fixed outlets" + } def __init__(self, description, averages): self.description: str = clean_description(description.lower()) + + translation = self.WELSH_TEXT.get(self.description) + if translation: + self.nodata = False + self.description = translation + self.description = correct_spelling(self.description) self.averages = averages diff --git a/model_data/epc_attributes/MainFuelAttributes.py b/model_data/epc_attributes/MainFuelAttributes.py index 055f4cac..363d0ec3 100644 --- a/model_data/epc_attributes/MainFuelAttributes.py +++ b/model_data/epc_attributes/MainFuelAttributes.py @@ -26,6 +26,8 @@ class MainFuelAttributes(Definitions): # Wood pellets have a higher energy density than wood chips. 
This is due to their manufacturing process, # which compresses the wood and removes most of the moisture, making them more efficient as a fuel 'wood pellets', + 'b30k', + 'dual fuel appliance mineral and wood', ] COMPLEX_FUEL_KEYWORDS = [ diff --git a/model_data/epc_attributes/MainheatAttributes.py b/model_data/epc_attributes/MainheatAttributes.py index 492c3123..70e78ee0 100644 --- a/model_data/epc_attributes/MainheatAttributes.py +++ b/model_data/epc_attributes/MainheatAttributes.py @@ -1,5 +1,5 @@ from model_data.BaseUtility import Definitions -from model_data.epc_attributes.attribute_utils import clean_description, process_part +from model_data.epc_attributes.attribute_utils import clean_description, process_part, switch_chars from typing import Dict, Union @@ -25,7 +25,10 @@ class MainHeatAttributes(Definitions): } def __init__(self, description: str): - self.description: str = clean_description(description.lower()) + + self.description = switch_chars(description.lower()) + + self.description: str = clean_description(self.description) # Remove special characters self.nodata = not description or description in self.DATA_ANOMALY_MATCHES diff --git a/model_data/epc_attributes/RoofAttributes.py b/model_data/epc_attributes/RoofAttributes.py index df1ce977..892217b6 100644 --- a/model_data/epc_attributes/RoofAttributes.py +++ b/model_data/epc_attributes/RoofAttributes.py @@ -10,6 +10,7 @@ class RoofAttributes(Definitions): WELSH_TEXT = { "ar oleddf, dim inswleiddio": "pitched, no insulation", + "ar oleddf, 150 mm o inswleiddio yn y llofft": "pitched, 150 mm loft insulation" } def __init__(self, description: str): diff --git a/model_data/epc_attributes/attribute_utils.py b/model_data/epc_attributes/attribute_utils.py index 9819cc01..a1b65327 100644 --- a/model_data/epc_attributes/attribute_utils.py +++ b/model_data/epc_attributes/attribute_utils.py @@ -65,6 +65,20 @@ def clean_description(description: str) -> str: return description +def switch_chars(description: 
str) -> str: + """ + Replaces each of the specified characters in a description with a comma. + Useful for descriptions like "Gas: mains gas" + """ + + # Switch : to , + chars = [":"] + for char in chars: + description = description.replace(char, ",") + + return description + + def process_part(result: Dict[str, Union[str, bool]], part: str, attr_list: List[str], prefix: str): """ Process a part of the description with a given list of epc_attributes diff --git a/model_data/tests/test_data/test_lighting_attributes_cases.py b/model_data/tests/test_data/test_lighting_attributes_cases.py index 7ddec1d3..d9e3f01f 100644 --- a/model_data/tests/test_data/test_lighting_attributes_cases.py +++ b/model_data/tests/test_data/test_lighting_attributes_cases.py @@ -30,5 +30,6 @@ test_cases = [ {'original_description': 'Excellent lighting efficiency', 'low_energy_proportion': 1.0}, {'original_description': 'Low energy lighting in 2% of fixed outlets', 'low_energy_proportion': 0.02}, {'original_description': 'No Low energy lighting', 'low_energy_proportion': 0}, - {'original_description': 'Goleuadau ynni-isel mewn 60% oGÇÖr mannau gosod', 'low_energy_proportion': 0.6} + {'original_description': 'Goleuadau ynni-isel mewn 60% oGÇÖr mannau gosod', 'low_energy_proportion': 0.6}, + {'original_description': 'Goleuadau ynni-isel ym mhob un oGÇÖr mannau gosod', 'low_energy_proportion': 1}, ] diff --git a/model_data/tests/test_data/test_main_fuel_attributes_cases.py b/model_data/tests/test_data/test_main_fuel_attributes_cases.py index 8a06c979..49502e88 100644 --- a/model_data/tests/test_data/test_main_fuel_attributes_cases.py +++ b/model_data/tests/test_data/test_main_fuel_attributes_cases.py @@ -60,5 +60,10 @@ mainfuel_cases = [ {'original_description': 'wood chips', 'fuel_type': 'wood chips', 'tariff_type': None, 'is_community': False, 'no_individual_heating_or_community_network': False, 'complex_fuel_type': None}, {'original_description': 'wood pellets', 'fuel_type': 'wood pellets', 'tariff_type': None, 
'is_community': False, - 'no_individual_heating_or_community_network': False, 'complex_fuel_type': None} + 'no_individual_heating_or_community_network': False, 'complex_fuel_type': None}, + {'original_description': 'Solid fuel: dual fuel appliance (mineral and wood)', + 'fuel_type': 'dual fuel appliance mineral and wood', + 'tariff_type': None, 'is_community': False, + 'no_individual_heating_or_community_network': False, 'complex_fuel_type': None}, + ] diff --git a/sapmodel.serverless.yml b/sapmodel.serverless.yml index 77d9fc1f..d43609d4 100644 --- a/sapmodel.serverless.yml +++ b/sapmodel.serverless.yml @@ -58,4 +58,5 @@ functions: - http: path: /predict method: POST + async: true # Enable async for long running tasks timeout: 120 # Set max run time to 2 minutes - we shouldn't need this much time so this can be reviewed