mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added EpcClean class, added mypy to requirements
This commit is contained in:
parent
85cf7cf294
commit
6fc27d4797
3 changed files with 169 additions and 140 deletions
139
epc_data/app.py
139
epc_data/app.py
|
|
@ -34,145 +34,6 @@ def handler():
|
|||
)
|
||||
|
||||
# TODO: Fill this
# EPC free-text columns that are candidates for cleaning.
ClEANING_FIELDS = [
    "roof-description",
    "floor-description",
    "walls-description",
    "mainheat-description",
]

# Only the roof description is processed here: tally how many records carry
# each distinct description string.
field = "roof-description"
unique_vals = Counter(record[field] for record in data)
|
||||
|
||||
def search_description_options(desc):
    """
    Translate a qualitative insulation keyword into a coarse rating.

    :param desc: keyword taken from a roof description
    :return: "average" for "insulated", "below average" for "limited"
    :raises Exception: for any other keyword
    """
    known_ratings = (
        ("insulated", "average"),
        ("limited", "below average"),
    )
    for keyword, rating in known_ratings:
        if desc == keyword:
            return rating
    raise Exception("Handle me")
|
||||
|
||||
def find_insulation_thickness(description_lower, is_pitched, is_roof_room, is_flat):
    """
    Derive an insulation measure from a lower-cased roof description.

    :param description_lower: roof description, already lower-cased
    :param is_pitched: whether the description mentions a pitched roof
    :param is_roof_room: whether the description mentions a roof room
    :param is_flat: whether the description mentions a flat roof
    :return: 0 when there is no insulation, an int thickness in mm, a string
        such as "300+" when the thickness carries a plus sign, a qualitative
        rating string, or None when nothing can be derived
    """
    if "no insulation" in description_lower:
        return 0

    if is_pitched:
        # Text following the last "pitched," marker.
        after_pitched = description_lower.rpartition("pitched,")[2]
        try:
            thickness = after_pitched.partition("mm")[0].strip()
            if "+" in thickness:
                # e.g. "300+": not an integer, hand back the raw text
                return thickness
            return int(thickness)
        except ValueError:
            # No numeric thickness present; fall back to keyword matching.
            if "invalid input" in description_lower:
                return None
            first_word = after_pitched.lstrip().partition(" ")[0]
            return search_description_options(first_word)

    if is_roof_room:
        # Just search for specific phrases
        phrase_ratings = {
            "ceiling insulated": "average",
            "thatched": "average",
        }
        remainder = description_lower.rpartition("roof room(s),")[2].lstrip()
        if remainder in phrase_ratings:
            return phrase_ratings[remainder]
        return search_description_options(remainder.partition(" ")[0])

    if is_flat:
        # Just search for specific phrases
        first_word = description_lower.rpartition("flat,")[2].lstrip().partition(" ")[0]
        return search_description_options(first_word)

    return None
|
||||
|
||||
import re
|
||||
def extract_thermal_transmittence(description_lower):
    """
    Pull a U-value and its unit out of a lower-cased description.

    :param description_lower: description text, already lower-cased
    :return: tuple (u_value, unit); u_value is the first decimal number found
        as a float (or None), unit is the matched unit text (or None)
    """
    # Find U-value: first number written with a decimal point
    number_match = re.search(r"(\d+\.\d+)", description_lower)
    u_value = float(number_match.group(1)) if number_match is not None else None

    # Find unit: matched exactly as spelled in the pattern below
    unit_match = re.search(r"(w/m-¦k)", description_lower)
    unit = unit_match.group(1) if unit_match is not None else None

    return u_value, unit
|
||||
|
||||
def clean_roof(description):
    """
    We aim to extract features about the roof, so we can characterise it. We will check:
    - If the roof is pitched
    - If there is a roof room
    - If there is a loft
    - If it is flat or thatched
    - If it has insulation, and if so, what degree of insulation
    - If a thermal transmittance (U-value) is quoted

    :param description: raw roof description text
    :return: dict with the keys is_pitched, is_roof_room, has_loft,
        insulation_thickness, has_dwelling_above, assumed, is_flat,
        is_thatched, thermal_transmittence, thermal_transmittence_unit
    :raises Exception: when the description mentions neither insulation,
        thermal transmittance nor thatch
    """
    description_lower = description.lower().strip()

    # A dwelling/premises above means there is no roof of its own to describe.
    if "another dwelling above" in description_lower or "other premises above" in description_lower:
        return {
            "is_pitched": False,
            "is_roof_room": False,
            "has_loft": False,
            "insulation_thickness": 0,
            "has_dwelling_above": True,
            "assumed": "assumed" in description_lower,
            "is_flat": "flat" in description_lower,
            "is_thatched": False,
            "thermal_transmittence": None,
            "thermal_transmittence_unit": None,
        }

    is_pitched = "pitched" in description_lower
    is_roof_room = "roof room" in description_lower
    has_loft = "loft" in description_lower
    is_flat = "flat" in description_lower
    is_thatched = "thatched" in description_lower

    thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None
    if "insulation" in description_lower or "insulated" in description_lower:
        insulation_thickness = find_insulation_thickness(description_lower, is_pitched, is_roof_room, is_flat)
    elif "thermal transmittance" in description_lower:
        thermal_transmittence, thermal_transmittence_unit = extract_thermal_transmittence(description_lower)
    elif is_thatched:
        # Thatched descriptions mention neither keyword above, so try both extractions.
        thermal_transmittence, thermal_transmittence_unit = extract_thermal_transmittence(description_lower)
        insulation_thickness = find_insulation_thickness(
            description_lower, is_pitched, is_roof_room, is_flat
        )
    else:
        raise Exception("Implment me 2")

    # Same key set as the early-return dict above, including is_thatched.
    return {
        "is_pitched": is_pitched,
        "is_roof_room": is_roof_room,
        "has_loft": has_loft,
        "insulation_thickness": insulation_thickness,
        "has_dwelling_above": False,
        "assumed": "assumed" in description_lower,
        "is_flat": is_flat,
        "is_thatched": is_thatched,
        "thermal_transmittence": thermal_transmittence,
        "thermal_transmittence_unit": thermal_transmittence_unit,
    }
|
||||
|
||||
# Clean every distinct roof description once, keeping the raw text next to
# the extracted attributes.
cleaned_roof = []
for description in unique_vals:
    roof_attributes = clean_roof(description)
    cleaned_roof.append({"original": description, "cleaned": roof_attributes})
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
epc-api-python
|
||||
python-dotenv
|
||||
tqdm
|
||||
pandas
|
||||
pandas
|
||||
mypy
|
||||
167
epc_data/tests/EpcClean.py
Normal file
167
epc_data/tests/EpcClean.py
Normal file
|
|
@ -0,0 +1,167 @@
|
|||
import re
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
|
||||
class EpcClean:
    """
    Container for methods which we utilise for cleaning EPC data
    """

    # Free-text EPC fields whose distinct values are counted by clean().
    CLEANING_FIELDS = [
        "roof-description",
        "floor-description",
        "walls-description",
        "mainheat-description",
    ]

    def __init__(self, data):
        """
        :param data: iterable of records; each record must be indexable by
            every name in CLEANING_FIELDS
        """
        self.data = data
        # field name -> Counter of the distinct raw values seen for that field
        self.unique_vals = {}
        # field name -> list of {"original": ..., "cleaned": ...} entries
        self.cleaned = {}

    def clean(self):
        """
        This method cleans the EPC data, mapping text fields to property attributes.

        Distinct values are counted for every field in CLEANING_FIELDS, but
        only "roof-description" is currently converted into attributes.

        :return: None; results are stored on self.unique_vals and self.cleaned
        """
        self._init_empty_cleaned_obj()

        for field in self.CLEANING_FIELDS:
            self.unique_vals[field] = Counter(record[field] for record in self.data)

        # Each distinct description is cleaned once, however often it occurs.
        for description in self.unique_vals["roof-description"]:
            self.cleaned["roof-description"].append(
                {"original": description, "cleaned": self.clean_roof(description)}
            )

    def _init_empty_cleaned_obj(self):
        """Reset self.cleaned to an empty list for every cleaning field."""
        self.cleaned = defaultdict(list, {k: [] for k in self.CLEANING_FIELDS})

    @staticmethod
    def _search_description_options(desc):
        """
        Map a qualitative insulation keyword onto a coarse rating.

        :param desc: keyword taken from a roof description
        :return: "average" for "insulated", "below average" for "limited"
        :raises Exception: for any other keyword
        """
        if desc == "insulated":
            return "average"
        if desc == "limited":
            return "below average"
        raise Exception("Handle me")

    # The helper was originally defined under this public name while every
    # call site used the underscore-prefixed one; both names are kept so
    # existing callers keep working.
    search_description_options = _search_description_options

    def _find_insulation_thickness(self, description_lower, is_pitched, is_roof_room, is_flat):
        """
        Derive an insulation measure from a lower-cased roof description.

        :param description_lower: roof description, already lower-cased
        :param is_pitched: whether the description mentions a pitched roof
        :param is_roof_room: whether the description mentions a roof room
        :param is_flat: whether the description mentions a flat roof
        :return: 0 when there is no insulation, an int thickness in mm, a
            string such as "300+" when the thickness carries a plus sign, a
            qualitative rating string, or None when nothing can be derived
        :raises Exception: when a qualitative keyword is not recognised
        """
        if "no insulation" in description_lower:
            return 0

        if is_pitched:
            try:
                thickness = description_lower.split("pitched,")[-1].split("mm")[0].strip()
                if "+" in thickness:
                    # e.g. "300+": not an integer, hand back the raw text
                    return thickness
                return int(thickness)
            except ValueError:
                # No numeric thickness present; fall back to keyword matching.
                if "invalid input" in description_lower:
                    return None
                desc = description_lower.split("pitched,")[-1].lstrip().split(" ")[0]
                return self._search_description_options(desc)

        if is_roof_room:
            desc_split_lookup = {
                "ceiling insulated": "average",
                "thatched": "average",
            }
            # Just search for specific phrases
            desc_split = description_lower.split("roof room(s),")[-1].lstrip()
            res = desc_split_lookup.get(desc_split)
            if res:
                return res

            desc = desc_split.split(" ")[0]
            return self._search_description_options(desc)

        if is_flat:
            # Just search for specific phrases
            desc = description_lower.split("flat,")[-1].lstrip().split(" ")[0]
            return self._search_description_options(desc)

        return None

    @staticmethod
    def _extract_thermal_transmittence(description_lower):
        """
        Pull a U-value and its unit out of a lower-cased description.

        :param description_lower: description text, already lower-cased
        :return: tuple (u_value, unit); u_value is the first decimal number
            found as a float (or None), unit is the matched unit text (or None)
        """
        # Find U-value: first number written with a decimal point
        u_match = re.search(r"(\d+\.\d+)", description_lower)
        u_value = float(u_match.group(1)) if u_match is not None else None

        # Find unit: matched exactly as spelled in the pattern below
        unit_match = re.search(r"(w/m-¦k)", description_lower)
        unit = unit_match.group(1) if unit_match is not None else None

        return u_value, unit

    def clean_roof(self, description):
        """
        We aim to extract features about the roof, so we can characterise it. We will check:
        - If the roof is pitched
        - If there is a roof room
        - If there is a loft
        - If it is flat or thatched
        - If it has insulation, and if so, what degree of insulation
        - If a thermal transmittance (U-value) is quoted

        :param description: raw roof description text
        :return: dict with the keys is_pitched, is_roof_room, has_loft,
            insulation_thickness, has_dwelling_above, assumed, is_flat,
            is_thatched, thermal_transmittence, thermal_transmittence_unit
        :raises Exception: when the description mentions neither insulation,
            thermal transmittance nor thatch
        """
        description_lower = description.lower().strip()

        # A dwelling/premises above means there is no roof of its own to describe.
        if "another dwelling above" in description_lower or "other premises above" in description_lower:
            return {
                "is_pitched": False,
                "is_roof_room": False,
                "has_loft": False,
                "insulation_thickness": 0,
                "has_dwelling_above": True,
                "assumed": "assumed" in description_lower,
                "is_flat": "flat" in description_lower,
                "is_thatched": False,
                "thermal_transmittence": None,
                "thermal_transmittence_unit": None,
            }

        is_pitched = "pitched" in description_lower
        is_roof_room = "roof room" in description_lower
        has_loft = "loft" in description_lower
        is_flat = "flat" in description_lower
        is_thatched = "thatched" in description_lower

        thermal_transmittence, thermal_transmittence_unit, insulation_thickness = None, None, None
        if "insulation" in description_lower or "insulated" in description_lower:
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        elif "thermal transmittance" in description_lower:
            thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(
                description_lower
            )
        elif is_thatched:
            # Thatched descriptions mention neither keyword above, so try both extractions.
            thermal_transmittence, thermal_transmittence_unit = self._extract_thermal_transmittence(
                description_lower
            )
            insulation_thickness = self._find_insulation_thickness(
                description_lower, is_pitched, is_roof_room, is_flat
            )
        else:
            raise Exception("Implment me 2")

        # Same key set as the early-return dict above, including is_thatched.
        return {
            "is_pitched": is_pitched,
            "is_roof_room": is_roof_room,
            "has_loft": has_loft,
            "insulation_thickness": insulation_thickness,
            "has_dwelling_above": False,
            "assumed": "assumed" in description_lower,
            "is_flat": is_flat,
            "is_thatched": is_thatched,
            "thermal_transmittence": thermal_transmittence,
            "thermal_transmittence_unit": thermal_transmittence_unit,
        }
|
||||
Loading…
Add table
Reference in a new issue