debugging cleaning class

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-07 15:06:13 +03:00
parent 1b84033d0b
commit 6e1607bbba
14 changed files with 52 additions and 8 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -373,6 +373,7 @@ async def trigger_plan(body: PlanTriggerRequest):
recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
# TODO: Set the TRANSACTION_TYPE
# Clean the data
cleaning_data = read_parquet_from_s3(
bucket_name="retrofit-data-dev",

View file

@ -2,6 +2,8 @@ from typing import List, Dict, Any
from collections import Counter
from collections import defaultdict
import pandas as pd
from model_data.utils import correct_spelling
from model_data.epc_attributes.FloorAttributes import FloorAttributes
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
@ -97,7 +99,7 @@ class EpcClean:
self._init_empty_cleaned_obj()
for field in self.CLEANING_FIELDS:
self.unique_vals[field] = Counter([v[field] for v in self.data])
self.unique_vals[field] = Counter([v[field] for v in self.data if not pd.isnull(v[field])])
self.clean_wrapper(field="floor-description", cleaning_cls=FloorAttributes)
self.clean_wrapper(field="hotwater-description", cleaning_cls=HotWaterAttributes)

View file

@ -14,6 +14,7 @@ class FloorAttributes(Definitions):
# Welsh-language floor descriptions mapped to their English equivalents.
# Keys are written in lowercase; values are the English EPC wording.
WELSH_TEXT = {
    "(anheddiad arall islaw)": "(another dwelling below)",
    # "solet" is Welsh for "solid" (the value was previously misspelled "dolid").
    "solet, dim inswleiddio (rhagdybiaeth)": "solid, no insulation (assumed)",
}
def __init__(self, description: str):

View file

@ -15,7 +15,8 @@ class HotWaterAttributes(Definitions):
'oil boiler', # A boiler that uses oil as fuel to heat water
'electric instantaneous', # Similar to gas instantaneous, but uses electricity as its energy source
'gas multipoint', # A gas water heater that can supply hot water to multiple points of use at once
'heat pump' # A general category for heat pumps, regardless of the energy source
'heat pump', # A general category for heat pumps, regardless of the energy source
'solid fuel boiler' # burns solid materials to generate heat for water heating and/or space heating
]
# SYSTEM_TYPES refer to the larger system within which the heater operates.
@ -83,6 +84,7 @@ class HotWaterAttributes(Definitions):
# not common, especially in modern homes.
APPLIANCE_SYSTEMS = [
'gas range cooker', # A gas-powered range cooker
'oil range cooker'
]
# Descriptions which represent the same thing
@ -92,6 +94,7 @@ class HotWaterAttributes(Definitions):
WELSH_TEXT = {
"ogçör brif system": "from main system",
"ogçör brif system, adfer gwres nwyon ffliw": "from main system, flue gas heat recovery"
}
def __init__(self, description: str):
@ -118,6 +121,7 @@ class HotWaterAttributes(Definitions):
self.CHP_SYSTEMS,
self.NO_SYSTEM_PRESENT_KEYWORDS,
self.APPLIANCE_SYSTEMS,
self.DISTRIBUTION_SYSTEM_KEYWORDS
]
):
raise ValueError('Invalid description')

View file

@ -4,9 +4,18 @@ from model_data.utils import correct_spelling
class LightingAttributes:
WELSH_TEXT = {
"goleuadau ynni-isel ym mhob un ogçör mannau gosod": "low energy lighting in all fixed outlets"
}
def __init__(self, description, averages):
self.description: str = clean_description(description.lower())
translation = self.WELSH_TEXT.get(self.description)
if translation:
self.nodata = False
self.description = translation
self.description = correct_spelling(self.description)
self.averages = averages

View file

@ -26,6 +26,8 @@ class MainFuelAttributes(Definitions):
# Wood pellets have a higher energy density than wood chips. This is due to their manufacturing process,
# which compresses the wood and removes most of the moisture, making them more efficient as a fuel
'wood pellets',
'b30k',
'dual fuel appliance mineral and wood',
]
COMPLEX_FUEL_KEYWORDS = [

View file

@ -1,5 +1,5 @@
from model_data.BaseUtility import Definitions
from model_data.epc_attributes.attribute_utils import clean_description, process_part
from model_data.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
from typing import Dict, Union
@ -25,7 +25,10 @@ class MainHeatAttributes(Definitions):
}
def __init__(self, description: str):
self.description: str = clean_description(description.lower())
self.description = switch_chars(description.lower())
self.description: str = clean_description(self.description)
# Remove special characters
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES

View file

@ -10,6 +10,7 @@ class RoofAttributes(Definitions):
WELSH_TEXT = {
"ar oleddf, dim inswleiddio": "pitched, no insulation",
"ar oleddf, 150 mm o inswleiddio yn y llofft": "pitched, 150 mm loft insulation"
}
def __init__(self, description: str):

View file

@ -65,6 +65,20 @@ def clean_description(description: str) -> str:
return description
def switch_chars(description: str, *, chars: str = ":", replacement: str = ",") -> str:
    """
    Replace every occurrence of the given characters in a description.

    With the defaults, each ":" becomes ",", which is useful for
    descriptions like "Gas: mains gas" (returned as "Gas, mains gas").

    :param description: The text to process.
    :param chars: Characters to replace; each character in this string is
        treated individually. Defaults to ":".
    :param replacement: Text substituted for each matched character.
        Defaults to ",".
    :return: The description with all matching characters replaced.
    """
    # Build a single translation table so all characters are swapped in one
    # pass, and replacement text is never itself re-scanned.
    table = str.maketrans(dict.fromkeys(chars, replacement))
    return description.translate(table)
def process_part(result: Dict[str, Union[str, bool]], part: str, attr_list: List[str], prefix: str):
"""
Process a part of the description with a given list of epc_attributes

View file

@ -30,5 +30,6 @@ test_cases = [
{'original_description': 'Excellent lighting efficiency', 'low_energy_proportion': 1.0},
{'original_description': 'Low energy lighting in 2% of fixed outlets', 'low_energy_proportion': 0.02},
{'original_description': 'No Low energy lighting', 'low_energy_proportion': 0},
{'original_description': 'Goleuadau ynni-isel mewn 60% oGÇÖr mannau gosod', 'low_energy_proportion': 0.6}
{'original_description': 'Goleuadau ynni-isel mewn 60% oGÇÖr mannau gosod', 'low_energy_proportion': 0.6},
{'original_description': 'Goleuadau ynni-isel ym mhob un oGÇÖr mannau gosod', 'low_energy_proportion': 1},
]

View file

@ -60,5 +60,10 @@ mainfuel_cases = [
{'original_description': 'wood chips', 'fuel_type': 'wood chips', 'tariff_type': None, 'is_community': False,
'no_individual_heating_or_community_network': False, 'complex_fuel_type': None},
{'original_description': 'wood pellets', 'fuel_type': 'wood pellets', 'tariff_type': None, 'is_community': False,
'no_individual_heating_or_community_network': False, 'complex_fuel_type': None}
'no_individual_heating_or_community_network': False, 'complex_fuel_type': None},
{'original_description': 'Solid fuel: dual fuel appliance (mineral and wood)',
'fuel_type': 'dual fuel appliance mineral and wood',
'tariff_type': None, 'is_community': False,
'no_individual_heating_or_community_network': False, 'complex_fuel_type': None},
]

View file

@ -58,4 +58,5 @@ functions:
- http:
path: /predict
method: POST
async: true # Enable async for long running tasks
timeout: 120 # Set max run time to 2 minutes - we shouldn't need this much time so this can be reviewed