From 7b04e1edc72a2e255fbc359fbbec3c1c72a37206 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 25 Jul 2024 18:13:50 +0100 Subject: [PATCH] preparing for data extraction --- etl/xml_survey_extraction/XmlParser.py | 27 +++++++++----------------- 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index d14dafc4..76fa5612 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -44,8 +44,8 @@ def get_house_number(address: str) -> str | None: class XmlParser: - epc = None - additional_data = None + epc = {} + additional_data = {} uprn = None # heating/emissions information @@ -72,12 +72,6 @@ class XmlParser: floor_dimensions = None - # The value of the URPN tells us about the file type that we're parsing - UPRN_FILETYPE_MAP = { - 0: "EPR", - -1: "RDSAP_EPR" - } - RATINGS_MAP = { "0": "N/A", "1": "Very Poor", @@ -122,14 +116,11 @@ class XmlParser: self.filekey = filekey self.surveyor_company = surveyor_company - # The xml parser is use to parse the EPC and EPR xmls and different file types will contain different - # information - # In order to identify the file type, we can look for the presence of the 'UPRN' tag - # If the UPRN tag is present, we can assume that the file is an EPC - # If the UPRN tag is not present, we can assume that the file is an EPR - self.get_uprn(uprn) + # We check if we have a lig xml or rdsap xml + # We look for the presence of the Schema-Version-Original tag + self.is_lig = len(self.xml.getElementsByTagName("Schema-Version-Original")) > 0 - self.file_type = self.UPRN_FILETYPE_MAP.get(self.uprn, "EPC") + self.get_uprn(uprn) @staticmethod def get_node(node): @@ -145,10 +136,10 @@ class XmlParser: return node_first_child.nodeValue def run(self): - if self.file_type == "RDSAP_EPR": - # This file type contains just limited information compared to a regular EPR/EPC, and so we just exit - # unless we learn something else that determines that we need information from this file + + if not self.is_lig: return + self.get_assessor_details() self.get_heating_and_emissions_data()