mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
debugging cleaning class
This commit is contained in:
parent
1b84033d0b
commit
6e1607bbba
14 changed files with 52 additions and 8 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -373,6 +373,7 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
|
||||
recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
|
||||
|
||||
# TODO: Set the TRANSACTION_TYPE
|
||||
# Clean the data
|
||||
cleaning_data = read_parquet_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@ from typing import List, Dict, Any
|
|||
from collections import Counter
|
||||
from collections import defaultdict
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from model_data.utils import correct_spelling
|
||||
from model_data.epc_attributes.FloorAttributes import FloorAttributes
|
||||
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
|
|
@ -97,7 +99,7 @@ class EpcClean:
|
|||
self._init_empty_cleaned_obj()
|
||||
|
||||
for field in self.CLEANING_FIELDS:
|
||||
self.unique_vals[field] = Counter([v[field] for v in self.data])
|
||||
self.unique_vals[field] = Counter([v[field] for v in self.data if not pd.isnull(v[field])])
|
||||
|
||||
self.clean_wrapper(field="floor-description", cleaning_cls=FloorAttributes)
|
||||
self.clean_wrapper(field="hotwater-description", cleaning_cls=HotWaterAttributes)
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ class FloorAttributes(Definitions):
|
|||
|
||||
# Welsh-language floor descriptions mapped to their English equivalents.
# The English values must use the exact wording of the English descriptions
# (e.g. "solid", not a misspelling), since the translated text replaces the
# original description before further processing.
WELSH_TEXT = {
    "(anheddiad arall islaw)": "(another dwelling below)",
    "solet, dim inswleiddio (rhagdybiaeth)": "solid, no insulation (assumed)",
}
|
||||
|
||||
def __init__(self, description: str):
|
||||
|
|
|
|||
|
|
@ -15,7 +15,8 @@ class HotWaterAttributes(Definitions):
|
|||
'oil boiler', # A boiler that uses oil as fuel to heat water
|
||||
'electric instantaneous', # Similar to gas instantaneous, but uses electricity as its energy source
|
||||
'gas multipoint', # A gas water heater that can supply hot water to multiple points of use at once
|
||||
'heat pump' # A general category for heat pumps, regardless of the energy source
|
||||
'heat pump', # A general category for heat pumps, regardless of the energy source
|
||||
'solid fuel boiler' # burns solid materials to generate heat for water heating and/or space heating
|
||||
]
|
||||
|
||||
# SYSTEM_TYPES refer to the larger system within which the heater operates.
|
||||
|
|
@ -83,6 +84,7 @@ class HotWaterAttributes(Definitions):
|
|||
# not common, especially in modern homes.
|
||||
APPLIANCE_SYSTEMS = [
|
||||
'gas range cooker', # A gas-powered range cooker
|
||||
'oil range cooker'
|
||||
]
|
||||
|
||||
# Descriptions which represent the same thing
|
||||
|
|
@ -92,6 +94,7 @@ class HotWaterAttributes(Definitions):
|
|||
|
||||
WELSH_TEXT = {
|
||||
"ogçör brif system": "from main system",
|
||||
"ogçör brif system, adfer gwres nwyon ffliw": "from main system, flue gas heat recovery"
|
||||
}
|
||||
|
||||
def __init__(self, description: str):
|
||||
|
|
@ -118,6 +121,7 @@ class HotWaterAttributes(Definitions):
|
|||
self.CHP_SYSTEMS,
|
||||
self.NO_SYSTEM_PRESENT_KEYWORDS,
|
||||
self.APPLIANCE_SYSTEMS,
|
||||
self.DISTRIBUTION_SYSTEM_KEYWORDS
|
||||
]
|
||||
):
|
||||
raise ValueError('Invalid description')
|
||||
|
|
|
|||
|
|
@ -4,9 +4,18 @@ from model_data.utils import correct_spelling
|
|||
|
||||
|
||||
class LightingAttributes:
|
||||
WELSH_TEXT = {
|
||||
"goleuadau ynni-isel ym mhob un ogçör mannau gosod": "low energy lighting in all fixed outlets"
|
||||
}
|
||||
|
||||
def __init__(self, description, averages):
|
||||
self.description: str = clean_description(description.lower())
|
||||
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
if translation:
|
||||
self.nodata = False
|
||||
self.description = translation
|
||||
|
||||
self.description = correct_spelling(self.description)
|
||||
self.averages = averages
|
||||
|
||||
|
|
|
|||
|
|
@ -26,6 +26,8 @@ class MainFuelAttributes(Definitions):
|
|||
# Wood pellets have a higher energy density than wood chips. This is due to their manufacturing process,
|
||||
# which compresses the wood and removes most of the moisture, making them more efficient as a fuel
|
||||
'wood pellets',
|
||||
'b30k',
|
||||
'dual fuel appliance mineral and wood',
|
||||
]
|
||||
|
||||
COMPLEX_FUEL_KEYWORDS = [
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
from model_data.BaseUtility import Definitions
|
||||
from model_data.epc_attributes.attribute_utils import clean_description, process_part
|
||||
from model_data.epc_attributes.attribute_utils import clean_description, process_part, switch_chars
|
||||
from typing import Dict, Union
|
||||
|
||||
|
||||
|
|
@ -25,7 +25,10 @@ class MainHeatAttributes(Definitions):
|
|||
}
|
||||
|
||||
def __init__(self, description: str):
|
||||
self.description: str = clean_description(description.lower())
|
||||
|
||||
self.description = switch_chars(description.lower())
|
||||
|
||||
self.description: str = clean_description(self.description)
|
||||
# Remove special characters
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ class RoofAttributes(Definitions):
|
|||
|
||||
WELSH_TEXT = {
|
||||
"ar oleddf, dim inswleiddio": "pitched, no insulation",
|
||||
"ar oleddf, 150 mm o inswleiddio yn y llofft": "pitched, 150 mm loft insulation"
|
||||
}
|
||||
|
||||
def __init__(self, description: str):
|
||||
|
|
|
|||
|
|
@ -65,6 +65,20 @@ def clean_description(description: str) -> str:
|
|||
return description
|
||||
|
||||
|
||||
def switch_chars(description: str, chars: tuple = (":",), replacement: str = ",") -> str:
    """
    Replace separator characters in a description with a comma.

    Useful for descriptions like "Gas: mains gas", which become
    "Gas, mains gas" so that the parts are comma-separated.

    :param description: The text to normalise.
    :param chars: The substrings to replace. Defaults to a single colon,
        which is the only character switched when the argument is omitted.
    :param replacement: The text substituted for each entry in ``chars``.
        Defaults to a comma.
    :return: The description with every occurrence of each entry in
        ``chars`` replaced by ``replacement``.
    """

    for char in chars:
        # str.replace("", x) would insert x between every character, so an
        # empty separator is ignored rather than mangling the description.
        if not char:
            continue
        description = description.replace(char, replacement)

    return description
|
||||
|
||||
|
||||
def process_part(result: Dict[str, Union[str, bool]], part: str, attr_list: List[str], prefix: str):
|
||||
"""
|
||||
Process a part of the description with a given list of epc_attributes
|
||||
|
|
|
|||
|
|
@ -30,5 +30,6 @@ test_cases = [
|
|||
{'original_description': 'Excellent lighting efficiency', 'low_energy_proportion': 1.0},
|
||||
{'original_description': 'Low energy lighting in 2% of fixed outlets', 'low_energy_proportion': 0.02},
|
||||
{'original_description': 'No Low energy lighting', 'low_energy_proportion': 0},
|
||||
{'original_description': 'Goleuadau ynni-isel mewn 60% oGÇÖr mannau gosod', 'low_energy_proportion': 0.6}
|
||||
{'original_description': 'Goleuadau ynni-isel mewn 60% oGÇÖr mannau gosod', 'low_energy_proportion': 0.6},
|
||||
{'original_description': 'Goleuadau ynni-isel ym mhob un oGÇÖr mannau gosod', 'low_energy_proportion': 1},
|
||||
]
|
||||
|
|
|
|||
|
|
@ -60,5 +60,10 @@ mainfuel_cases = [
|
|||
{'original_description': 'wood chips', 'fuel_type': 'wood chips', 'tariff_type': None, 'is_community': False,
|
||||
'no_individual_heating_or_community_network': False, 'complex_fuel_type': None},
|
||||
{'original_description': 'wood pellets', 'fuel_type': 'wood pellets', 'tariff_type': None, 'is_community': False,
|
||||
'no_individual_heating_or_community_network': False, 'complex_fuel_type': None}
|
||||
'no_individual_heating_or_community_network': False, 'complex_fuel_type': None},
|
||||
{'original_description': 'Solid fuel: dual fuel appliance (mineral and wood)',
|
||||
'fuel_type': 'dual fuel appliance mineral and wood',
|
||||
'tariff_type': None, 'is_community': False,
|
||||
'no_individual_heating_or_community_network': False, 'complex_fuel_type': None},
|
||||
|
||||
]
|
||||
|
|
|
|||
|
|
@ -58,4 +58,5 @@ functions:
|
|||
- http:
|
||||
path: /predict
|
||||
method: POST
|
||||
async: true # Enable async for long running tasks
|
||||
timeout: 120 # Set max run time to 2 minutes - we shouldn't need this much time so this can be reviewed
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue