diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py index f764bd00..426a56c5 100644 --- a/model_data/analysis/SapModel.py +++ b/model_data/analysis/SapModel.py @@ -79,12 +79,16 @@ class SalModel: self.fit_error = None self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()} - def _append_extracted_u_values(self, model_data): + def _append_cleaned_data(self, model_data): """ We need to estimate the u-value impact for: 1) Walls 2) Roof 3) Floors + We append this data onto model_data. + + Additionally, we append on the extracted proportion of low energy lighting, which + is more reliably extracted than by using the low-energy-lighting column """ wall_u_values = pd.DataFrame(self.cleaner.cleaned["walls-description"])[ @@ -102,6 +106,9 @@ class SalModel: columns={"thermal_transmittance": "roof_u_value", } ) + lighting_proportions = pd.DataFrame(self.cleaner.cleaned["lighting-description"])[ + ["original_description", "low_energy_proportion"]] + model_data = model_data.merge( wall_u_values, how="left", @@ -149,7 +156,7 @@ class SalModel: model_data["idx"] = model_data.index.copy() # Append on u-values - model_data = self._append_extracted_u_values(model_data) + model_data = self._append_cleaned_data(model_data) # Convert transaction_type model_data = self._convert_transaction_type(model_data) @@ -264,11 +271,3 @@ self = SalModel( data=all_data["data"], cleaner=all_data["cleaner"] ) - -descs = [] -for x in all_data["data"]: - descs.append(x["lighting-description"]) - -descs = list(set(descs)) - -df = pd.DataFrame(all_data['data']) diff --git a/model_data/epc_attributes/LightingAttributes.py b/model_data/epc_attributes/LightingAttributes.py index 40123036..f0dd6ef4 100644 --- a/model_data/epc_attributes/LightingAttributes.py +++ b/model_data/epc_attributes/LightingAttributes.py @@ -1,11 +1,16 @@ import re from model_data.epc_attributes.attribute_utils import clean_description +from model_data.utils import correct_spelling class LightingAttributes: + def
__init__(self, description, averages): self.description: str = clean_description(description.lower()) + self.description = correct_spelling(self.description) self.averages = averages + # Correct spelling mistakes in averages + # self.averages["lighting-description"] = self.averages["lighting-description"].apply(correct_spelling) def process(self): diff --git a/model_data/requirements.txt b/model_data/requirements.txt index 126c63ed..ff4d3dda 100644 --- a/model_data/requirements.txt +++ b/model_data/requirements.txt @@ -16,4 +16,6 @@ geopandas mip seaborn statsmodels -scikit-learn \ No newline at end of file +scikit-learn +pyspellchecker +textblob \ No newline at end of file diff --git a/model_data/tests/test_utils.py b/model_data/tests/test_utils.py index ea8d0fd0..1bd8c2f0 100644 --- a/model_data/tests/test_utils.py +++ b/model_data/tests/test_utils.py @@ -1,7 +1,7 @@ import logging from io import StringIO from unittest.mock import patch -from model_data.utils import setup_logger +from model_data.utils import setup_logger, is_percentage_or_number, correct_spelling class TestLogger: @@ -47,3 +47,20 @@ class TestLogger: assert log_stream.read() == "Hello World!\n" # remove the handler after use logger.removeHandler(handler) + + def test_is_percentage_or_number(self): + assert is_percentage_or_number("88") + assert is_percentage_or_number("88%") + assert not is_percentage_or_number("abc") + assert not is_percentage_or_number("") + assert not is_percentage_or_number("88.0") # only integer numbers or percentages + assert is_percentage_or_number("101%") # numbers over 100 allowed + assert not is_percentage_or_number("-1") # negative numbers not allowed + + def test_correct_spelling(self): + assert correct_spelling("speling") == "spelling" + assert correct_spelling("88") == "88" # numbers are left unchanged + assert correct_spelling("corerct") == "correct" + assert correct_spelling("excllent") == "excellent" + assert correct_spelling("") == "" # empty string should 
return an empty string + assert correct_spelling("88%") == "88%" # percentages are left unchanged diff --git a/model_data/utils.py b/model_data/utils.py index d643f36a..28f8da41 100644 --- a/model_data/utils.py +++ b/model_data/utils.py @@ -1,4 +1,9 @@ import logging +import re +from textblob import TextBlob + +# Pre-compile the regular expression +PERCENTAGE_PATTERN = re.compile(r'^\d+%?$') def setup_logger(log_file=None, level=logging.INFO, overwrite_handler=False): @@ -33,3 +38,24 @@ def setup_logger(log_file=None, level=logging.INFO, overwrite_handler=False): logger.addHandler(console_handler) return logger + + +def is_percentage_or_number(s): + # re.match returns None if the string does not match the pattern + return PERCENTAGE_PATTERN.match(s) is not None + + +def correct_spelling(text): + words = text.split() + + corrected_words = [] + for word in words: + if is_percentage_or_number(word): + corrected_words.append(word) + else: + blob = TextBlob(word) # create a TextBlob object + corrected_word = blob.correct() # use the correct method to correct spelling + corrected_words.append(str(corrected_word)) # convert corrected word back to string + + corrected_text = ' '.join(corrected_words) + return corrected_text