From 32cdd70b71157aedcfc4d6ab8568b7acb3c38ddb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 4 Sep 2024 11:33:48 +0100 Subject: [PATCH] debugged get_property_type from xml extractor --- backend/app/energy_assessments/router.py | 58 +++++++++++++++++++++--- etl/xml_survey_extraction/XmlParser.py | 51 ++++++++++++++++----- 2 files changed, 91 insertions(+), 18 deletions(-) diff --git a/backend/app/energy_assessments/router.py b/backend/app/energy_assessments/router.py index 21c4e4c1..1e4b44f0 100644 --- a/backend/app/energy_assessments/router.py +++ b/backend/app/energy_assessments/router.py @@ -103,14 +103,60 @@ async def upload(body: EnergyAssessmentUploadPayload): uprn = int(assessment.rstrip("/").split("/")[-1]) assessments_map[uprn] = { "xmls": uploaded_xmls, - "eprs": eprs, - "condition_reports": condition_reports, - "evidence_reports": evidence_reports, - "summary_reports": summary_reports, - "floor_plans": floor_plans + "EPR": eprs, + "Condition Report": condition_reports, + "Evidence Report": evidence_reports, + "Summary Information": summary_reports, + "Floor PLan": floor_plans } - logger.info(f"Exatracted XMLS for the energy assessments") + logger.info("Extracted energy assessment data and storing file locations to database") + xml_data_to_store = [] + energy_assessment_documents = [] + for uprn, files in assessments_map.items(): + + # Create the rows of data to insert into the energy assessment documents + property_ea_docs = [] + for doc_type, doc_files in files.items(): + if doc_type == "xmls": + continue + property_ea_docs.append( + { + "uprn": uprn, + "document_type": doc_type, + "document_location": doc_files + } + ) + energy_assessment_documents.extend(property_ea_docs) + + xmls = files["xmls"] + extracted_data = {} + for xml in xmls: + xml_data = read_from_s3(bucket_name=get_settings().ENERGY_ASSESSMENTS_BUCKET, s3_file_name=xml) + xml_data_io = BytesIO(xml_data) + xml_parser = XmlParser( + file=xml_data_io, + filekey=os.path.join(f"s3://{get_settings().ENERGY_ASSESSMENTS_BUCKET}", xml), + uprn=uprn, + surveyor_company=body.surveyor, + ) + xml_parser.run() + if xml_parser.is_lig: + logger.info(f"Extracted data from {xml}") + extracted_epc = xml_parser.epc + extracted_additional_data = xml_parser.additional_data + + data_to_update = { + **extracted_epc, **extracted_additional_data + } + + # We need to update the keys to match the database schema - i.e. we should replace all hyphens with + # underscores + data_to_update = {k.replace("-", "_"): v for k, v in data_to_update.items()} + + extracted_data.update(data_to_update) + + xml_data_to_store.append(extracted_data) except IntegrityError: logger.error("Database integrity error occurred", exc_info=True) diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index 0bc3d56b..c39e8f95 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -9,6 +9,7 @@ from etl.xml_survey_extraction.pcdb import heating_data PROPERTY_TYPE_LOOKUP = { "0": "House", "House": "House", + "2": "Flat" } @@ -471,6 +472,13 @@ class XmlParser: if not property_type: property_type = self.xml.getElementsByTagName('PropertyType1') + if len(property_type) > 1: + property_types = {PROPERTY_TYPE_LOOKUP[p.firstChild.nodeValue] for p in property_type} + if len(property_types) > 1: + raise ValueError("Multiple property types found") + + return property_types.pop() + return PROPERTY_TYPE_LOOKUP[property_type[0].firstChild.nodeValue] def get_sap(self): @@ -683,6 +691,30 @@ class XmlParser: self.perimeter = self.heat_loss_perimeter + self.party_wall_length + @staticmethod + def _parse_windows_content(window, glazing_type_lookup, orientation_lookup): + + # There may not be a pvc frame + pvc_frame = window.getElementsByTagName("PVC-Frame") + pvc_frame = pvc_frame[0].firstChild.nodeValue if pvc_frame else None + + # There may not be a glazing gap for single glazed windows + glazing_gap = window.getElementsByTagName("Glazing-Gap") + glazing_gap = glazing_gap[0].firstChild.nodeValue if glazing_gap else None + + parsed = { + "window_location": window.getElementsByTagName("Window-Location")[0].firstChild.nodeValue, + "window_area": window.getElementsByTagName("Window-Area")[0].firstChild.nodeValue, + "window_type": window.getElementsByTagName("Window-Type")[0].firstChild.nodeValue, + "glazing_type": glazing_type_lookup[ + window.getElementsByTagName("Glazing-Type")[0].firstChild.nodeValue + ], + "pvc_frame": pvc_frame, + "glazing_gap": glazing_gap, + "orientation": orientation_lookup[window.getElementsByTagName("Orientation")[0].firstChild.nodeValue] + } + return parsed + def get_windows(self): """ Extracts data about the windows in the property, including the number of windows and the window type. @@ -692,7 +724,8 @@ class XmlParser: sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window") glazing_type_lookup = { - "3": "double glazing, unknown install date" + "3": "double glazing, unknown install date", + "5": "Single glazing", } orientation_lookup = { @@ -707,15 +740,9 @@ class XmlParser: } self.windows = [ - { - "window_location": window.getElementsByTagName("Window-Location")[0].firstChild.nodeValue, - "window_area": window.getElementsByTagName("Window-Area")[0].firstChild.nodeValue, - "window_type": window.getElementsByTagName("Window-Type")[0].firstChild.nodeValue, - "glazing_type": glazing_type_lookup[ - window.getElementsByTagName("Glazing-Type")[0].firstChild.nodeValue - ], - "pvc_frame": window.getElementsByTagName("PVC-Frame")[0].firstChild.nodeValue, - "glazing_gap": window.getElementsByTagName("Glazing-Gap")[0].firstChild.nodeValue, - "orientation": orientation_lookup[window.getElementsByTagName("Orientation")[0].firstChild.nodeValue] - } for window in sap_windows + self._parse_windows_content( + window=window, + glazing_type_lookup=glazing_type_lookup, + orientation_lookup=orientation_lookup + ) for window in sap_windows ]