From a3c2ff06a8a2cf4317fc8e89285756fa5a49d398 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 25 Jul 2024 15:55:13 +0100 Subject: [PATCH] retrieved all epc fields --- etl/xml_survey_extraction/XmlParser.py | 74 ++++++++++++++++++++++++-- etl/xml_survey_extraction/app.py | 8 +++ 2 files changed, 78 insertions(+), 4 deletions(-) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index 0d9dc512..1533d4c7 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -1,4 +1,5 @@ import re +import numpy as np import usaddress from datetime import datetime from xml.dom.minidom import parseString @@ -43,6 +44,7 @@ def get_house_number(address: str) -> str | None: class XmlParser: + epc = None uprn = None # heating/emissions information @@ -56,6 +58,7 @@ class XmlParser: number_of_doors = None number_of_insulated_doors = None + windows = None # Property dimensions number_of_floors = None @@ -153,7 +156,7 @@ class XmlParser: self.get_heating_and_emissions_data() - self.get_detailed_heating_specs() + # self.get_detailed_heating_specs() # Building fabric self.get_doors() @@ -161,11 +164,21 @@ class XmlParser: # Property dimensions self.get_property_dimensions() + self.get_floor_dimensions() + + self.get_windows() + # Get all of the EPC data self.extract_epc() def extract_epc(self): + if self.floor_dimensions is None: + raise ValueError("Run get_floor_dimensions() first") + + if self.windows is None: + raise ValueError("Run get_windows() first") + property_type = self.get_property_type() if property_type == "Flat": @@ -178,6 +191,15 @@ class XmlParser: flat_storey_count = "" flat_top_storey = "" floor_level = "NO DATA!" + energy_tariff = "NO DATA!" 
+ + floor_height = np.mean([ + float(x['room_height']) for x in self.floor_dimensions if x['building_part_identifier'] == 'Main Dwelling' + ]) + + # Take the most prevalent glazing type + glazed_type = [w["glazing_type"] for w in self.windows if w['window_location'] == '0'] + glazed_type = max(glazed_type, key=glazed_type.count) self.epc = { "uprn": self.uprn, @@ -286,7 +308,7 @@ class XmlParser: "tenure": self.TENURE_MAP[self.get_node_value('Tenure')], "floor-level": floor_level, "potential-energy-efficiency": self.get_energy_assessment_value('Energy-Rating-Potential'), - "potentual-energy-rating": sap_to_epc(float(self.get_energy_assessment_value('Energy-Rating-Potential'))), + "potential-energy-rating": sap_to_epc(float(self.get_energy_assessment_value('Energy-Rating-Potential'))), "hot-water-energy-eff": self.RATINGS_MAP[ self.get_property_summary_value('Hot-Water', 'Energy-Efficiency-Rating') ], @@ -304,7 +326,9 @@ class XmlParser: "lodgement-datetime": datetime.strptime(self.get_node_value('Inspection-Date'), "%Y-%m-%d").isoformat(), "mainheat-description": self.get_property_summary_value('Main-Heating', 'Description'), - + "floor-height": floor_height, + "glazed-type": glazed_type, + "energy-tariff": energy_tariff, } def get_node_value(self, tag_name): @@ -405,7 +429,7 @@ class XmlParser: .getElementsByTagName("Main-Heating")[0] ) - heating_code = sap_main_heating_details.getElementsByTagName("SAP-Main-Heating-Code")[0].firstChild.nodeValue + heating_code = sap_main_heating_details.getElementsByTagName("Main-Heating-Number")[0].firstChild.nodeValue # Get the heating system heating_system = heating_data[heating_data["code"] == int(heating_code)]["description"] @@ -579,3 +603,45 @@ class XmlParser: floor_dimensions.extend(data) self.floor_dimensions = floor_dimensions + + def get_windows(self): + """ + Extracts data about the windows in the property, including the number of windows and the window type. 
+ :return: + """ + + sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window") + + # This is the data in each sap window: + # Window-Location: 2 + # Window-Area: 1.55 + # Window-Type: 1 + # Glazing-Type: 3 + # PVC-Frame: true + # Glazing-Gap: 16+ + # Orientation: 7 + + glazing_type_lookup = { + "3": "double glazing, unknown install date" + } + + orientation_lookup = { + "3": "East", + "5": "South", + "1": "North", + "7": "West", + } + + self.windows = [ + { + "window_location": window.getElementsByTagName("Window-Location")[0].firstChild.nodeValue, + "window_area": window.getElementsByTagName("Window-Area")[0].firstChild.nodeValue, + "window_type": window.getElementsByTagName("Window-Type")[0].firstChild.nodeValue, + "glazing_type": glazing_type_lookup[ + window.getElementsByTagName("Glazing-Type")[0].firstChild.nodeValue + ], + "pvc_frame": window.getElementsByTagName("PVC-Frame")[0].firstChild.nodeValue, + "glazing_gap": window.getElementsByTagName("Glazing-Gap")[0].firstChild.nodeValue, + "orientation": orientation_lookup[window.getElementsByTagName("Orientation")[0].firstChild.nodeValue] + } for window in sap_windows + ] diff --git a/etl/xml_survey_extraction/app.py b/etl/xml_survey_extraction/app.py index c70097d4..c32bd787 100644 --- a/etl/xml_survey_extraction/app.py +++ b/etl/xml_survey_extraction/app.py @@ -1,3 +1,5 @@ +import pandas as pd + from utils.s3 import read_from_s3, list_files_and_subfolders_in_s3_folder, list_xmls_in_s3_folder from utils.logger import setup_logger from etl.xml_survey_extraction.XmlParser import XmlParser @@ -51,3 +53,9 @@ def main(): logger.info(f"Extracted data from {xml}") # TODO: Set a portfolio ID, Target and Automatically upload the asset list and create the event for the portfolio + + # TODO: In order to get the full data associated to the heating system, we need to download and parse the pcdb which + # can be found here: https://www.ncm-pcdb.org.uk/pcdb/pcdb10.dat + # https://www.ncm-pcdb.org.uk/sap/download + # However retrieving this data is not a priority, so we can leave 
this for now as parsing the database + # is a non-trivial task