mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
added separate detection of tariff
This commit is contained in:
parent
d6b4403cfe
commit
c2ebb49b18
3 changed files with 37 additions and 10 deletions
|
|
@ -58,7 +58,7 @@ def handler():
|
|||
df = df.reset_index(drop=True)
|
||||
|
||||
import numpy as np
|
||||
idx = 36
|
||||
idx = 1
|
||||
record = df[df.index == idx].to_dict("records")[0]
|
||||
record = {k: v for k, v in record.items() if v not in [None, np.nan, False]}
|
||||
from pprint import pprint
|
||||
|
|
|
|||
|
|
@ -11,32 +11,41 @@ class MainFuelAttributes:
|
|||
'biomass',
|
||||
'biodiesel',
|
||||
# Note: there is also a category called 'bottled LPG', but only 2/50k entries had this
|
||||
'LPG',
|
||||
'lpg',
|
||||
'waste combustion',
|
||||
'biogas',
|
||||
'wood logs',
|
||||
'dual fuel - mineral + wood',
|
||||
'dual fuel mineral wood',
|
||||
'gas',
|
||||
'anthracite',
|
||||
'smokeless coal',
|
||||
'house coal'
|
||||
]
|
||||
|
||||
NO_FUEL_KEYWORDS = [
|
||||
'To be used only when there is no heating/hot-water system or data is from a community network'
|
||||
TARIFF_KEYWORDS = [
|
||||
'unspecified tariff'
|
||||
# We may come across more later but this is all observed for now
|
||||
]
|
||||
|
||||
UNKNOWN_FUEL = "unknown"
|
||||
|
||||
NO_INDIVIDUAL_HEATING_OR_COMMUNITY_NETWORK = [
|
||||
'to be used only when there is no heatinghotwater system or data is from a community network'
|
||||
]
|
||||
|
||||
def __init__(self, description: str):
|
||||
self.description: str = remove_punctuation(clean_description(description.lower()))
|
||||
|
||||
self.nodata = not description or any(keyword in self.description for keyword in self.NO_FUEL_KEYWORDS)
|
||||
|
||||
self.is_community = False if 'not community' in self.description else 'community' in self.description
|
||||
self.is_unknown = False
|
||||
self.nodata = not description
|
||||
|
||||
if not self.nodata and not any(
|
||||
self._keyword_in_description(keywords)
|
||||
for keywords in [
|
||||
self.FUEL_KEYWORDS
|
||||
self.FUEL_KEYWORDS,
|
||||
self.NO_INDIVIDUAL_HEATING_OR_COMMUNITY_NETWORK,
|
||||
self.TARIFF_KEYWORDS
|
||||
]
|
||||
):
|
||||
raise ValueError('Invalid description')
|
||||
|
|
@ -47,10 +56,18 @@ class MainFuelAttributes:
|
|||
def process(self) -> Dict[str, Union[str, bool]]:
|
||||
result: Dict[str, Union[str, bool]] = {
|
||||
"fuel_type": self._find_keyword(self.FUEL_KEYWORDS),
|
||||
"no_fuel": self._find_keyword(self.NO_FUEL_KEYWORDS),
|
||||
"is_community": self.is_community
|
||||
"tariff_type": self._find_keyword(self.TARIFF_KEYWORDS),
|
||||
"is_community": self.is_community,
|
||||
"no_individual_heating_or_community_network": self._find_keyword(
|
||||
self.NO_INDIVIDUAL_HEATING_OR_COMMUNITY_NETWORK
|
||||
),
|
||||
}
|
||||
|
||||
if not result["fuel_type"]:
|
||||
result["fuel_type"] = self.UNKNOWN_FUEL
|
||||
# We'll do checks on unknown fuel types to ensure we don't miss anything
|
||||
self.is_unknown = True
|
||||
|
||||
return result
|
||||
|
||||
def _find_keyword(self, keywords):
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ from typing import Tuple, Union, Dict, List
|
|||
|
||||
THERMAL_TRANSMITTENCE_STR = r"average thermal transmittance (-?\d+\.\d+)\s(w/m-¦k)"
|
||||
THERMAL_TRANSMITTANCE_REGEX = re.compile(THERMAL_TRANSMITTENCE_STR)
|
||||
DOUBLE_SPACE_PATTERN = re.compile(r"\s+")
|
||||
|
||||
|
||||
def extract_thermal_transmittance(result: dict, description: str) -> Tuple[
|
||||
|
|
@ -59,6 +60,8 @@ def clean_description(description: str) -> str:
|
|||
special_chars = [":", ";", "*", "@", "?", "!", "(", ")"]
|
||||
for char in special_chars:
|
||||
description = description.replace(char, " ")
|
||||
|
||||
description = remove_double_spaces(description)
|
||||
return description
|
||||
|
||||
|
||||
|
|
@ -103,4 +106,11 @@ def remove_punctuation(text: str) -> str:
|
|||
# Use the translation table to remove punctuation from the text
|
||||
text_without_punctuation = text.translate(translation_table)
|
||||
|
||||
text_without_punctuation = remove_double_spaces(text_without_punctuation)
|
||||
|
||||
return text_without_punctuation
|
||||
|
||||
|
||||
def remove_double_spaces(text):
    """Collapse whitespace runs in *text* into single spaces.

    Every match of the module-level ``DOUBLE_SPACE_PATTERN`` is replaced
    with one space character; the resulting string is returned.
    """
    return DOUBLE_SPACE_PATTERN.sub(" ", text)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue