diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index 0d9dc512..1533d4c7 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -1,4 +1,5 @@
import re
+import numpy as np
import usaddress
from datetime import datetime
from xml.dom.minidom import parseString
@@ -43,6 +44,7 @@ def get_house_number(address: str) -> str | None:
class XmlParser:
+ epc = None
uprn = None
# heating/emissions information
@@ -56,6 +58,7 @@ class XmlParser:
number_of_doors = None
number_of_insulated_doors = None
+ windows = None
# Property dimensions
number_of_floors = None
@@ -153,7 +156,7 @@ class XmlParser:
self.get_heating_and_emissions_data()
- self.get_detailed_heating_specs()
+ # self.get_detailed_heating_specs()
# Building fabric
self.get_doors()
@@ -161,11 +164,21 @@ class XmlParser:
# Property dimensions
self.get_property_dimensions()
+ self.get_floor_dimensions()
+
+ self.get_windows()
+
# Get all of the EPC data
self.extract_epc()
def extract_epc(self):
+ if self.floor_dimensions is None:
+ raise ValueError("Run get_floor_dimensions() first")
+
+ if self.windows is None:
+ raise ValueError("Run get_windows() first")
+
property_type = self.get_property_type()
if property_type == "Flat":
@@ -178,6 +191,15 @@ class XmlParser:
flat_storey_count = ""
flat_top_storey = ""
floor_level = "NO DATA!"
+ energy_tariff = "NO DATA!"
+
+ floor_height = np.mean([
+ float(x['room_height']) for x in self.floor_dimensions if x['building_part_identifier'] == 'Main Dwelling'
+ ])
+
+ # Take the most prevalent glazing type
+ glazed_type = [w["glazing_type"] for w in self.windows if w['window_location'] == '0']
+ glazed_type = max(glazed_type, key=glazed_type.count)
self.epc = {
"uprn": self.uprn,
@@ -286,7 +308,7 @@ class XmlParser:
"tenure": self.TENURE_MAP[self.get_node_value('Tenure')],
"floor-level": floor_level,
"potential-energy-efficiency": self.get_energy_assessment_value('Energy-Rating-Potential'),
- "potentual-energy-rating": sap_to_epc(float(self.get_energy_assessment_value('Energy-Rating-Potential'))),
+ "potential-energy-rating": sap_to_epc(float(self.get_energy_assessment_value('Energy-Rating-Potential'))),
"hot-water-energy-eff": self.RATINGS_MAP[
self.get_property_summary_value('Hot-Water', 'Energy-Efficiency-Rating')
],
@@ -304,7 +326,9 @@ class XmlParser:
"lodgement-datetime":
datetime.strptime(self.get_node_value('Inspection-Date'), "%Y-%m-%d").isoformat(),
"mainheat-description": self.get_property_summary_value('Main-Heating', 'Description'),
-
+ "floor-height": floor_height,
+ "glazed-type": glazed_type,
+ "energy-tariff": energy_tariff,
}
def get_node_value(self, tag_name):
@@ -405,7 +429,7 @@ class XmlParser:
.getElementsByTagName("Main-Heating")[0]
)
- heating_code = sap_main_heating_details.getElementsByTagName("SAP-Main-Heating-Code")[0].firstChild.nodeValue
+ heating_code = sap_main_heating_details.getElementsByTagName("Main-Heating-Number")[0].firstChild.nodeValue
# Get the heating system
heating_system = heating_data[heating_data["code"] == int(heating_code)]["description"]
@@ -579,3 +603,45 @@ class XmlParser:
floor_dimensions.extend(data)
self.floor_dimensions = floor_dimensions
+
+    def get_windows(self):
+        """
+        Extracts data about each window in the property and stores it on self.windows.
+
+        Every SAP-Window element inside the first SAP-Windows element of self.xml becomes one dictionary with the
+        keys window_location, window_area, window_type, glazing_type, pvc_frame, glazing_gap and orientation.
+        Values are the raw text taken from the XML, except glazing_type and orientation, which are translated to
+        a description when the code is known. A code without a known description is passed through unchanged and
+        a missing or empty element gives None, so a single unexpected window does not abort the extraction.
+
+        :raises IndexError: if the document contains no SAP-Windows element
+        :return: None, the result is stored on self.windows
+        """
+
+        # Example of the values recorded for a single SAP-Window (tag names are presumed from the order in which
+        # the fields are read below - confirm against a real survey file):
+        #   Window-Location: 2
+        #   Window-Area:     1.55
+        #   Window-Type:     1
+        #   Glazing-Type:    3
+        #   PVC-Frame:       true
+        #   Glazing-Gap:     16+
+        #   Orientation:     7
+
+        # Only the glazing code seen so far is described; other codes fall through as the raw code (see below)
+        glazing_type_lookup = {
+            "3": "double glazing, unknown install date",
+        }
+
+        # Codes run clockwise from North. The intermediate points (2, 4, 6, 8) are filled in on the assumption that
+        # the numbering advances in 45 degree steps - TODO confirm against the schema
+        orientation_lookup = {
+            "1": "North",
+            "2": "North East",
+            "3": "East",
+            "4": "South East",
+            "5": "South",
+            "6": "South West",
+            "7": "West",
+            "8": "North West",
+        }
+
+        def child_text(node, tag_name):
+            # Text of the first tag_name element below node, or None when the element is absent or empty
+            matches = node.getElementsByTagName(tag_name)
+            if not matches or matches[0].firstChild is None:
+                return None
+            return matches[0].firstChild.nodeValue
+
+        def describe(lookup, code):
+            # Translate a code to its description, keeping the raw code when it has no entry rather than
+            # raising a KeyError
+            return lookup.get(code, code)
+
+        sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window")
+
+        self.windows = [
+            {
+                "window_location": child_text(window, "Window-Location"),
+                "window_area": child_text(window, "Window-Area"),
+                "window_type": child_text(window, "Window-Type"),
+                "glazing_type": describe(glazing_type_lookup, child_text(window, "Glazing-Type")),
+                "pvc_frame": child_text(window, "PVC-Frame"),
+                "glazing_gap": child_text(window, "Glazing-Gap"),
+                "orientation": describe(orientation_lookup, child_text(window, "Orientation")),
+            } for window in sap_windows
+        ]
diff --git a/etl/xml_survey_extraction/app.py b/etl/xml_survey_extraction/app.py
index c70097d4..c32bd787 100644
--- a/etl/xml_survey_extraction/app.py
+++ b/etl/xml_survey_extraction/app.py
@@ -1,3 +1,5 @@
+import pandas as pd
+
from utils.s3 import read_from_s3, list_files_and_subfolders_in_s3_folder, list_xmls_in_s3_folder
from utils.logger import setup_logger
from etl.xml_survey_extraction.XmlParser import XmlParser
@@ -51,3 +53,9 @@ def main():
logger.info(f"Extracted data from {xml}")
# TODO: Set a portfolio ID, Target and Automatically upload the asset list and create the event for the portfolio
+
+ # TODO: To get the full data associated with the heating system, we need to download and parse the PCDB, which
+ # can be found here: https://www.ncm-pcdb.org.uk/pcdb/pcdb10.dat
+ # https://www.ncm-pcdb.org.uk/sap/download
+ # Retrieving this data is not a priority, and parsing the database is a non-trivial task, so we can leave
+ # this for now.