Added spelling correction using nlp textblob

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-03 18:12:23 +01:00
parent d0f55b2e7d
commit d9b792d23b
5 changed files with 61 additions and 12 deletions

View file

@ -79,12 +79,16 @@ class SalModel:
self.fit_error = None
self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
def _append_extracted_u_values(self, model_data):
def _append_cleaned_data(self, model_data):
"""
We need to estimate the u-value impact for:
1) Walls
2) Roof
3) Floors
We append this data on
Additionally, we append on the extracted proportion of low energy lighting, which
is more reliably extracted than by using the low-energy-lighting column
"""
wall_u_values = pd.DataFrame(self.cleaner.cleaned["walls-description"])[
@ -102,6 +106,9 @@ class SalModel:
columns={"thermal_transmittance": "roof_u_value", }
)
lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[
["original_description", "low_energy_proportion"]]
model_data = model_data.merge(
wall_u_values,
how="left",
@ -149,7 +156,7 @@ class SalModel:
model_data["idx"] = model_data.index.copy()
# Append on u-values
model_data = self._append_extracted_u_values(model_data)
model_data = self._append_cleaned_data(model_data)
# Convert transaction_type
model_data = self._convert_transaction_type(model_data)
@ -264,11 +271,3 @@ self = SalModel(
data=all_data["data"],
cleaner=all_data["cleaner"]
)
descs = []
for x in all_data["data"]:
descs.append(x["lighting-description"])
descs = list(set(descs))
df = pd.DataFrame(all_data['data'])

View file

@ -1,11 +1,16 @@
import re
from model_data.epc_attributes.attribute_utils import clean_description
from model_data.utils import correct_spelling
class LightingAttributes:
def __init__(self, description, averages):
    """Store the normalised lighting description and the averages.

    The description is lower-cased, passed through ``clean_description``
    and then through ``correct_spelling`` before being stored.
    ``averages`` is stored exactly as supplied.
    """
    cleaned = clean_description(description.lower())
    self.description: str = correct_spelling(cleaned)

    # TODO(review): averages["lighting-description"] is not spell-corrected
    # here; confirm whether it should also go through correct_spelling.
    self.averages = averages
def process(self):

View file

@ -16,4 +16,6 @@ geopandas
mip
seaborn
statsmodels
scikit-learn
scikit-learn
pyspellchecker
textblob

View file

@ -1,7 +1,7 @@
import logging
from io import StringIO
from unittest.mock import patch
from model_data.utils import setup_logger
from model_data.utils import setup_logger, is_percentage_or_number, correct_spelling
class TestLogger:
@ -47,3 +47,20 @@ class TestLogger:
assert log_stream.read() == "Hello World!\n"
# remove the handler after use
logger.removeHandler(handler)
def test_is_percentage_or_number(self):
    """Check which strings count as a whole number or whole-number percentage."""
    # Plain integers and integers followed by "%" are accepted;
    # numbers over 100 are allowed.
    for accepted in ("88", "88%", "101%"):
        assert is_percentage_or_number(accepted)

    # Words, the empty string, decimals (only integer numbers or
    # percentages are allowed) and negative numbers are rejected.
    for rejected in ("abc", "", "88.0", "-1"):
        assert not is_percentage_or_number(rejected)
def test_correct_spelling(self):
    """Check misspellings are fixed while numbers and percentages pass through."""
    expected_by_input = {
        "speling": "spelling",
        "88": "88",  # numbers are left unchanged
        "corerct": "correct",
        "excllent": "excellent",
        "": "",  # empty string should return an empty string
        "88%": "88%",  # percentages are left unchanged
    }
    for raw, expected in expected_by_input.items():
        assert correct_spelling(raw) == expected

View file

@ -1,4 +1,9 @@
import logging
import re
from functools import lru_cache

from textblob import TextBlob
# Pre-compile the regular expression
PERCENTAGE_PATTERN = re.compile(r'^\d+%?$')
def setup_logger(log_file=None, level=logging.INFO, overwrite_handler=False):
@ -33,3 +38,24 @@ def setup_logger(log_file=None, level=logging.INFO, overwrite_handler=False):
logger.addHandler(console_handler)
return logger
def is_percentage_or_number(s):
    """Return True if ``s`` is one or more digits optionally followed by ``%``.

    Anything else (empty string, decimals, signs, words) returns False.
    """
    # fullmatch rather than match: "$" in the pattern also matches just
    # before a trailing newline, so match() would accept "88" followed by
    # a newline. fullmatch returns None unless the whole string matches.
    return PERCENTAGE_PATTERN.fullmatch(s) is not None
@lru_cache(maxsize=4096)
def _correct_word(word):
    """Return TextBlob's spelling correction for a single token as a str.

    Results are memoised (bounded cache) so a token that recurs across
    calls is only run through TextBlob once.
    """
    return str(TextBlob(word).correct())


def correct_spelling(text):
    """Spell-correct ``text`` token by token.

    The text is split on whitespace; tokens that are plain numbers or
    percentages (per ``is_percentage_or_number``) are kept as they are and
    every other token is corrected with TextBlob. The tokens are re-joined
    with single spaces, so runs of whitespace collapse and leading/trailing
    whitespace is dropped. An empty string yields an empty string.
    """
    return ' '.join(
        word if is_percentage_or_number(word) else _correct_word(word)
        for word in text.split()
    )