mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added spelling correction using nlp textblob
This commit is contained in:
parent
d0f55b2e7d
commit
d9b792d23b
5 changed files with 61 additions and 12 deletions
|
|
@ -79,12 +79,16 @@ class SalModel:
|
|||
self.fit_error = None
|
||||
self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
|
||||
|
||||
def _append_extracted_u_values(self, model_data):
|
||||
def _append_cleaned_data(self, model_data):
|
||||
"""
|
||||
We need to estimate the u-value impact for:
|
||||
1) Walls
|
||||
2) Roof
|
||||
3) Floors
|
||||
We append this data on
|
||||
|
||||
Additionally, we append on the extracted proportion of low energy lighting, which
|
||||
is more reliably extracted than by using the low-energy-lighting column
|
||||
"""
|
||||
|
||||
wall_u_values = pd.DataFrame(self.cleaner.cleaned["walls-description"])[
|
||||
|
|
@ -102,6 +106,9 @@ class SalModel:
|
|||
columns={"thermal_transmittance": "roof_u_value", }
|
||||
)
|
||||
|
||||
lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
|
||||
["original_description", "low_energy_proportion"]]
|
||||
|
||||
model_data = model_data.merge(
|
||||
wall_u_values,
|
||||
how="left",
|
||||
|
|
@ -149,7 +156,7 @@ class SalModel:
|
|||
model_data["idx"] = model_data.index.copy()
|
||||
|
||||
# Append on u-values
|
||||
model_data = self._append_extracted_u_values(model_data)
|
||||
model_data = self._append_cleaned_data(model_data)
|
||||
|
||||
# Convert transaction_type
|
||||
model_data = self._convert_transaction_type(model_data)
|
||||
|
|
@ -264,11 +271,3 @@ self = SalModel(
|
|||
data=all_data["data"],
|
||||
cleaner=all_data["cleaner"]
|
||||
)
|
||||
|
||||
descs = []
|
||||
for x in all_data["data"]:
|
||||
descs.append(x["lighting-description"])
|
||||
|
||||
descs = list(set(descs))
|
||||
|
||||
df = pd.DataFrame(all_data['data'])
|
||||
|
|
|
|||
|
|
@ -1,11 +1,16 @@
|
|||
import re
|
||||
from model_data.epc_attributes.attribute_utils import clean_description
|
||||
from model_data.utils import correct_spelling
|
||||
|
||||
|
||||
class LightingAttributes:
|
||||
|
||||
def __init__(self, description, averages):
|
||||
self.description: str = clean_description(description.lower())
|
||||
self.description = correct_spelling(self.description)
|
||||
self.averages = averages
|
||||
# Correct spelling mistakes in averages
|
||||
# self.averages["lighting-description"] = self.averages["lighting-description"].apply(correct_spelling)
|
||||
|
||||
def process(self):
|
||||
|
||||
|
|
|
|||
|
|
@ -16,4 +16,6 @@ geopandas
|
|||
mip
|
||||
seaborn
|
||||
statsmodels
|
||||
scikit-learn
|
||||
scikit-learn
|
||||
pyspellchecker
|
||||
textblob
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
import logging
|
||||
from io import StringIO
|
||||
from unittest.mock import patch
|
||||
from model_data.utils import setup_logger
|
||||
from model_data.utils import setup_logger, is_percentage_or_number, correct_spelling
|
||||
|
||||
|
||||
class TestLogger:
|
||||
|
|
@ -47,3 +47,20 @@ class TestLogger:
|
|||
assert log_stream.read() == "Hello World!\n"
|
||||
# remove the handler after use
|
||||
logger.removeHandler(handler)
|
||||
|
||||
def test_is_percentage_or_number(self):
    """Check which strings count as an integer or an integer percentage."""
    # Accepted: plain digits, with or without a trailing percent sign.
    assert is_percentage_or_number("88")
    assert is_percentage_or_number("88%")
    assert is_percentage_or_number("101%")  # values above 100 are allowed

    # Rejected: anything that is not a bare run of digits (+ optional '%').
    assert not is_percentage_or_number("")
    assert not is_percentage_or_number("abc")
    assert not is_percentage_or_number("88.0")  # only integer numbers or percentages
    assert not is_percentage_or_number("-1")  # negative numbers not allowed
|
||||
|
||||
def test_correct_spelling(self):
    """Misspelt words are corrected; numeric tokens and empty input pass through."""
    # Inputs that must come back untouched.
    assert correct_spelling("") == ""  # empty string stays empty
    assert correct_spelling("88") == "88"  # plain numbers are left unchanged
    assert correct_spelling("88%") == "88%"  # percentages are left unchanged

    # Genuine misspellings.
    assert correct_spelling("speling") == "spelling"
    assert correct_spelling("corerct") == "correct"
    assert correct_spelling("excllent") == "excellent"
|
||||
|
|
|
|||
|
|
@ -1,4 +1,9 @@
|
|||
import logging
|
||||
import re
|
||||
from textblob import TextBlob
|
||||
|
||||
# Pre-compile the regular expression
|
||||
PERCENTAGE_PATTERN = re.compile(r'^\d+%?$')
|
||||
|
||||
|
||||
def setup_logger(log_file=None, level=logging.INFO, overwrite_handler=False):
|
||||
|
|
@ -33,3 +38,24 @@ def setup_logger(log_file=None, level=logging.INFO, overwrite_handler=False):
|
|||
logger.addHandler(console_handler)
|
||||
|
||||
return logger
|
||||
|
||||
|
||||
def is_percentage_or_number(s):
    """Return True if *s* is a run of digits optionally followed by ``%``.

    Signs, decimal points, surrounding whitespace and the empty string are
    all rejected; there is no upper bound on the numeric value.
    """
    # fullmatch requires the whole string to be consumed. With plain match(),
    # the pattern's trailing `$` also succeeds just before a final newline,
    # so "88\n" would wrongly be accepted.
    return PERCENTAGE_PATTERN.fullmatch(s) is not None
|
||||
|
||||
|
||||
def correct_spelling(text):
    """Spell-correct *text* token by token, leaving numeric tokens untouched.

    The text is split on whitespace. Tokens accepted by
    ``is_percentage_or_number`` are kept verbatim; every other token is run
    through TextBlob's ``correct()``. Tokens are re-joined with single
    spaces, so the original spacing is not preserved.
    """
    return ' '.join(
        token if is_percentage_or_number(token) else str(TextBlob(token).correct())
        for token in text.split()
    )
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue