diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
new file mode 100644
index 00000000..d235be78
--- /dev/null
+++ b/etl/customers/aiha/xml_extraction.py
@@ -0,0 +1,70 @@
+import os
+from io import BytesIO
+from etl.xml_survey_extraction.XmlParser import XmlParser
+
+SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS"
+
+
+def main(survey_folder_path=SURVEY_FOLDER_PATH):
+    """
+    Extract survey data from the XML files in the subfolders of a survey folder.
+
+    Every XML file in each immediate subfolder is read into memory and passed to XmlParser. The parsed
+    ``epc`` and ``additional_data`` are collected together with the path of the subfolder the file came from.
+
+    :param survey_folder_path: Folder whose immediate subfolders hold the survey XML files. Defaults to
+        SURVEY_FOLDER_PATH, so existing calls without arguments behave as before.
+    :return: A list with one dict per XML file, holding the keys "epc", "additional_data" and "subfolder".
+    :raises FileNotFoundError: If a subfolder contains no XML files.
+    """
+    # Step 1: List all subfolders inside the survey folder. They are sorted because os.scandir yields
+    # entries in arbitrary order and a stable order makes runs reproducible.
+    subfolders = sorted(entry.path for entry in os.scandir(survey_folder_path) if entry.is_dir())
+
+    # Step 2: Loop through each subfolder and find the XML files.
+    extracted_surveys = []
+    for subfolder in subfolders:
+        print(f"Searching in subfolder: {subfolder}")
+
+        # Find all XML files in the current subfolder. The extension is matched case-insensitively so that
+        # files such as "SURVEY.XML" are not silently skipped.
+        xml_files = sorted(name for name in os.listdir(subfolder) if name.lower().endswith('.xml'))
+
+        if not xml_files:
+            raise FileNotFoundError(f"No XML files found in subfolder: {subfolder}")
+
+        # Extract the data from every XML file found. The subfolder path is stored with each result.
+        for xml_file in xml_files:
+            xml_path = os.path.join(subfolder, xml_file)
+            print(f"Processing XML file: {xml_path}")
+
+            # Read the XML into an in-memory buffer so the file handle is closed before parsing starts.
+            with open(xml_path, 'rb') as file:
+                xml_data_io = BytesIO(file.read())
+            uprn = None  # Set the UPRN if available.
+
+            # Create an XmlParser instance
+            xml_parser = XmlParser(
+                file=xml_data_io,
+                filekey=xml_path,
+                surveyor_company="",
+                uprn=uprn,
+            )
+
+            # Run the parser to extract the data
+            xml_parser.run()
+
+            # Store the extracted data for further processing
+            extracted_surveys.append({
+                "epc": xml_parser.epc,
+                "additional_data": xml_parser.additional_data,
+                "subfolder": subfolder
+            })
+
+    print(f"Extracted {len(extracted_surveys)} surveys.")
+    # Return the results so callers can process them further, for example save them to a database or a file.
+    return extracted_surveys
+
+
+if __name__ == "__main__":
+    main()
diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index ffe191a4..ed3d65d2 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -769,8 +769,6 @@ class XmlParser:
         :return:
         """
 
-        sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window")
-
         glazing_type_lookup = {
             "3": "double glazing, unknown install date",
             "5": "Single glazing",
@@ -787,6 +785,38 @@ class XmlParser:
             "8": "North West"
         }
 
+        sap_windows = self.xml.getElementsByTagName("SAP-Windows")
+
+        if not sap_windows:
+            # We look for Multi-Glazed-Proportion
+            multiple_glazing_type = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
+                "Multiple-Glazing-Type"
+            )[0].firstChild.nodeValue
+
+            pvc_frame = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
+                "PVC-Window-Frames"
+            )[0].firstChild.nodeValue
+
+            multple_glazed_proportion = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
+                "Multiple-Glazed-Proportion"
+            )[0].firstChild.nodeValue
+
+            self.windows = [
+                {
+                    "window_location": None,
+                    "window_area": None,
+                    "window_type": None,
+                    "glazing_type": glazing_type_lookup[multiple_glazing_type],
+                    "pvc_frame": pvc_frame,
+                    "glazing_gap": None,
+                    "orientation": None,
+                    "multple_glazed_proportion": multple_glazed_proportion
+                }
+            ]
return + + sap_windows = sap_windows[0].getElementsByTagName("SAP-Window") + self.windows = [ self._parse_windows_content( window=window,