diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index 1533d4c7..53f7e859 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -45,6 +45,7 @@ def get_house_number(address: str) -> str | None: class XmlParser: epc = None + additional_data = None uprn = None # heating/emissions information @@ -66,20 +67,11 @@ class XmlParser: heat_loss_perimeter = None party_wall_length = None total_floor_area = None - ground_floor_area = None - is_there_party_wall = None floor_height = None insulation_wall_area = None floor_dimensions = None - rrn = None - - database_data = None - - # We assume that the insulation wall area is 85% of the total wall area, as a standard estimate - INSULATION_WALL_AREA_FACTOR = 0.85 - # The value of the URPN tells us about the file type that we're parsing UPRN_FILETYPE_MAP = { 0: "EPR", @@ -119,6 +111,10 @@ class XmlParser: '1': "Owner-occupied" } + TARIFF_MAP = { + "2": "Single" + } + def __init__(self, file, filekey, uprn=None): file.seek(0) # Ensure the file pointer is at the beginning xml_string = file.read().decode('utf-8') @@ -161,9 +157,6 @@ class XmlParser: # Building fabric self.get_doors() - # Property dimensions - self.get_property_dimensions() - self.get_floor_dimensions() self.get_windows() @@ -171,6 +164,9 @@ class XmlParser: # Get all of the EPC data self.extract_epc() + # Put together all of the additional data we capture + self.extract_additional_data() + def extract_epc(self): if self.floor_dimensions is None: @@ -191,16 +187,23 @@ class XmlParser: flat_storey_count = "" flat_top_storey = "" floor_level = "NO DATA!" - energy_tariff = "NO DATA!" floor_height = np.mean([ - float(x['room_height']) for x in self.floor_dimensions if x['building_part_identifier'] == 'Main Dwelling' + float(x['room_height']) for x in self.floor_dimensions if + x['building_part_identifier'] == 'Main Dwelling' and not x['room_roof'] ]) # Take the most prevelant glazing type glazed_type = [w["glazing_type"] for w in self.windows if w['window_location'] == '0'] glazed_type = max(glazed_type, key=glazed_type.count) + energy_tariff = ( + self.xml.getElementsByTagName("SAP-Energy-Source")[0] + .getElementsByTagName("Meter-Type")[0] + .firstChild.nodeValue + ) + energy_tariff = self.TARIFF_MAP[energy_tariff] + self.epc = { "uprn": self.uprn, "uprn-source": "Address Matched", @@ -209,8 +212,6 @@ class XmlParser: **self.get_sap(), **self.get_property_address(), "low-energy-fixed-light-count": self.get_node_value('Low-Energy-Fixed-Lighting-Outlets-Count'), - # TODO: Needs to be done more carefully - # "floor-height" = self.get_node_value_from_floor_dimensions('Room-Height'), "construction-age-band": self.get_node_value('Construction-Age-Band'), "mainheat-energy-eff": self.RATINGS_MAP[ self.get_property_summary_value('Main-Heating', 'Energy-Efficiency-Rating') @@ -222,8 +223,6 @@ class XmlParser: self.get_property_summary_value('Lighting', 'Energy-Efficiency-Rating') ], "environment-impact-potential": self.get_energy_assessment_value('Environmental-Impact-Potential'), - # TODO: Needs to be done more careully since we have multiple windows - # "glazed-type": self.get_node_value('Glazing-Type'), "mainheatcont-description": self.get_property_summary_value('Main-Heating-Controls', 'Description'), "sheating-energy-eff": self.RATINGS_MAP[ @@ -232,8 +231,7 @@ class XmlParser: "local-authority": "", # Not included in the xml "local-authority-label": "", "fixed-lighting-outlets-count": self.get_node_value('Fixed-Lighting-Outlets-Count'), - # TODO: Doesn't seem to be included in the xml - # "energy-tariff": self.get_node_value('Energy-Tariff'), + "energy-tariff": energy_tariff, "mechanical-ventilation": self.MECHANICAL_VENTILATION_MAP[self.get_node_value('Mechanical-Ventilation')], "solar-water-heating-flag": self.get_node_value('Solar-Water-Heating'), "co2-emissions-potential": self.get_energy_assessment_value('CO2-Emissions-Potential'), @@ -328,7 +326,47 @@ class XmlParser: "mainheat-description": self.get_property_summary_value('Main-Heating', 'Description'), "floor-height": floor_height, "glazed-type": glazed_type, - "energy-tariff": energy_tariff, + } + + def get_insulation_wall_area(self): + """ + Extracts the insulation wall area for the main dwelling + :return: + """ + + main_dwelling_floors = [ + f for f in self.floor_dimensions if f["building_part_identifier"] == "Main Dwelling" and not f["room_roof"] + ] + main_dwelling_windows = [ + w for w in self.windows if w["window_location"] == "0" + ] + + wall_areas = sum([float(f["heat_loss_perimeter"]) * float(f["room_height"]) for f in main_dwelling_floors]) + window_areas = sum([float(w["window_area"]) for w in main_dwelling_windows]) + return wall_areas - window_areas + + def extract_additional_data(self): + + self.insulation_wall_area = self.get_insulation_wall_area() + + self.additional_data = { + "file_location": self.filekey, + "surveyor_name": self.surveyor_name, + "space_heating_kwh": self.space_heating_kwh, + "water_heating_kwh": self.water_heating_kwh, + # "heating_system": self.heating_system, + # "heating_controls": self.heating_controls, + "number_of_doors": self.number_of_doors, + "number_of_insulated_doors": self.number_of_insulated_doors, + "number_of_floors": self.number_of_floors, + "insulation_wall_area": self.insulation_wall_area, + "heat_loss_perimeter": self.heat_loss_perimeter, + "party_wall_length": self.party_wall_length, + "perimeter": self.perimeter, + "rooms_with_bath_and_or_shower": self.get_node_value('Rooms-With-Bath-And-Or-Shower'), + "rooms_with_mixer_shower_no_bath": self.get_node_value('Rooms-With-Mixer-Shower-No-Bath'), + "room_with_bath_and_mixer_shower": self.get_node_value('Rooms-With-Bath-And-Mixer-Shower'), + "percent_draftproofed": self.get_node_value('Percent-Draughtproofed'), } def get_node_value(self, tag_name): @@ -516,56 +554,6 @@ class XmlParser: "constituency-label": constituency_label } - def get_property_dimensions(self): - """ - This function will extract the relevant property dimensions including the floor area, - number of floors, perimeter, party wall length and the insulation_wall_area. - - insulation_wall_area is typically simplified down to perimeter * height * 0.85 - :return: - """ - - # Each floor has its own SAP-Floor-Dimension tag - floor_dimensions = ( - self.xml.getElementsByTagName("SAP-Floor-Dimensions")[0] - .getElementsByTagName("SAP-Floor-Dimension") - ) - - self.number_of_floors = len(floor_dimensions) - - self.heat_loss_perimeter = float( - floor_dimensions[0].getElementsByTagName("Heat-Loss-Perimeter")[0].firstChild.nodeValue - ) - - self.party_wall_length = float( - floor_dimensions[0].getElementsByTagName("Party-Wall-Length")[0].firstChild.nodeValue - ) - - party_wall_construction_tag = ( - self.xml.getElementsByTagName("Party-Wall-Construction")[0].firstChild.nodeValue.replace("\n", "").strip() - ) - - self.is_there_party_wall = ( - "Yes" if (self.party_wall_length > 0) or (party_wall_construction_tag != "") else "No" - ) - - # We pull out all of the floor areas - floor_areas = [ - float(x.getElementsByTagName("Total-Floor-Area")[0].firstChild.nodeValue) for x in floor_dimensions - ] - - self.total_floor_area = sum(floor_areas) - self.ground_floor_area = floor_areas[0] - - self.floor_height = float( - floor_dimensions[0] - .getElementsByTagName("Room-Height")[0] - .firstChild.nodeValue - ) - - self.insulation_wall_area = self.heat_loss_perimeter * self.floor_height * self.INSULATION_WALL_AREA_FACTOR - self.perimeter = self.heat_loss_perimeter + self.party_wall_length - def get_floor_dimensions(self): """ @@ -594,16 +582,53 @@ class XmlParser: 'floor': get_part_value(floor_dimension, 'Floor'), 'floor_construction': get_part_value(floor_dimension, 'Floor-Construction'), 'floor_insulation': get_part_value(floor_dimension, 'Floor-Insulation'), - 'heat_loss-perimeter': get_part_value(floor_dimension, 'Heat-Loss-Perimeter'), - 'party_wall-length': get_part_value(floor_dimension, 'Party-Wall-Length'), - 'total_floor-area': get_part_value(floor_dimension, 'Total-Floor-Area'), - 'room_height': get_part_value(floor_dimension, 'Room-Height') + 'heat_loss_perimeter': get_part_value(floor_dimension, 'Heat-Loss-Perimeter'), + 'party_wall_length': get_part_value(floor_dimension, 'Party-Wall-Length'), + 'total_floor_area': get_part_value(floor_dimension, 'Total-Floor-Area'), + 'room_height': get_part_value(floor_dimension, 'Room-Height'), + "room_roof": False } for floor_dimension in sap_floor_dimensions ] + + room_roofs = building_part.getElementsByTagName("SAP-Room-In-Roof") + room_roof_data = [ + { + "building_part_identifier": building_part_identifier, + "floor": str(max([int(d["floor"]) for d in data]) + 1), + "floor_construction": "", + "floor_insulation": rr.getElementsByTagName("Insulation")[0].firstChild.nodeValue, + "heat_loss_perimeter": "", + "party_wall_length": "", + "total_floor_area": rr.getElementsByTagName("Floor-Area")[0].firstChild.nodeValue, + "room_height": "", + "room_roof": True + } for rr in room_roofs + ] + floor_dimensions.extend(data) + floor_dimensions.extend(room_roof_data) self.floor_dimensions = floor_dimensions + self.number_of_floors = len( + [f for f in self.floor_dimensions if f["building_part_identifier"] == "Main Dwelling"] + ) + self.heat_loss_perimeter = max( + [ + float(f["heat_loss_perimeter"]) for f in self.floor_dimensions + if f["building_part_identifier"] == "Main Dwelling" and not f["room_roof"] + ] + ) + + self.party_wall_length = max( + [ + float(f["party_wall_length"]) for f in self.floor_dimensions + if f["building_part_identifier"] == "Main Dwelling" and not f["room_roof"] + ] + ) + + self.perimeter = self.heat_loss_perimeter + self.party_wall_length + def get_windows(self): """ Extracts data about the windows in the property, including the number of windows and the window type. @@ -612,15 +637,6 @@ class XmlParser: sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window") - # This is the data in each sap window: - # 2 - # 1.55 - # 1 - # 3 - # true - # 16+ - # 7 - glazing_type_lookup = { "3": "double glazing, unknown install date" } diff --git a/etl/xml_survey_extraction/app.py b/etl/xml_survey_extraction/app.py index c32bd787..b3500e71 100644 --- a/etl/xml_survey_extraction/app.py +++ b/etl/xml_survey_extraction/app.py @@ -48,7 +48,7 @@ def main(): for xml in xmls: xml_data = read_from_s3(bucket_name=BUCKET, s3_file_name=xml) xml_data_io = BytesIO(xml_data) - xml_parser = XmlParser(file=xml_data_io, filekey=xml, uprn=uprn) + xml_parser = XmlParser(file=xml_data_io, filekey=os.path.join(f"s3://{BUCKET}", xml), uprn=uprn) xml_parser.run() logger.info(f"Extracted data from {xml}")