diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index 1533d4c7..53f7e859 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -45,6 +45,7 @@ def get_house_number(address: str) -> str | None:
class XmlParser:
epc = None
+ additional_data = None
uprn = None
# heating/emissions information
@@ -66,20 +67,11 @@ class XmlParser:
heat_loss_perimeter = None
party_wall_length = None
total_floor_area = None
- ground_floor_area = None
- is_there_party_wall = None
floor_height = None
insulation_wall_area = None
floor_dimensions = None
- rrn = None
-
- database_data = None
-
- # We assume that the insulation wall area is 85% of the total wall area, as a standard estimate
- INSULATION_WALL_AREA_FACTOR = 0.85
-
# The value of the URPN tells us about the file type that we're parsing
UPRN_FILETYPE_MAP = {
0: "EPR",
@@ -119,6 +111,10 @@ class XmlParser:
'1': "Owner-occupied"
}
+ TARIFF_MAP = {
+ "2": "Single"
+ }
+
def __init__(self, file, filekey, uprn=None):
file.seek(0) # Ensure the file pointer is at the beginning
xml_string = file.read().decode('utf-8')
@@ -161,9 +157,6 @@ class XmlParser:
# Building fabric
self.get_doors()
- # Property dimensions
- self.get_property_dimensions()
-
self.get_floor_dimensions()
self.get_windows()
@@ -171,6 +164,9 @@ class XmlParser:
# Get all of the EPC data
self.extract_epc()
+ # Put together all of the additional data we capture
+ self.extract_additional_data()
+
def extract_epc(self):
if self.floor_dimensions is None:
@@ -191,16 +187,23 @@ class XmlParser:
flat_storey_count = ""
flat_top_storey = ""
floor_level = "NO DATA!"
- energy_tariff = "NO DATA!"
floor_height = np.mean([
- float(x['room_height']) for x in self.floor_dimensions if x['building_part_identifier'] == 'Main Dwelling'
+ float(x['room_height']) for x in self.floor_dimensions if
+ x['building_part_identifier'] == 'Main Dwelling' and not x['room_roof']
])
# Take the most prevelant glazing type
glazed_type = [w["glazing_type"] for w in self.windows if w['window_location'] == '0']
glazed_type = max(glazed_type, key=glazed_type.count)
+ energy_tariff = (
+ self.xml.getElementsByTagName("SAP-Energy-Source")[0]
+ .getElementsByTagName("Meter-Type")[0]
+ .firstChild.nodeValue
+ )
+ energy_tariff = self.TARIFF_MAP[energy_tariff]
+
self.epc = {
"uprn": self.uprn,
"uprn-source": "Address Matched",
@@ -209,8 +212,6 @@ class XmlParser:
**self.get_sap(),
**self.get_property_address(),
"low-energy-fixed-light-count": self.get_node_value('Low-Energy-Fixed-Lighting-Outlets-Count'),
- # TODO: Needs to be done more carefully
- # "floor-height" = self.get_node_value_from_floor_dimensions('Room-Height'),
"construction-age-band": self.get_node_value('Construction-Age-Band'),
"mainheat-energy-eff": self.RATINGS_MAP[
self.get_property_summary_value('Main-Heating', 'Energy-Efficiency-Rating')
@@ -222,8 +223,6 @@ class XmlParser:
self.get_property_summary_value('Lighting', 'Energy-Efficiency-Rating')
],
"environment-impact-potential": self.get_energy_assessment_value('Environmental-Impact-Potential'),
- # TODO: Needs to be done more careully since we have multiple windows
- # "glazed-type": self.get_node_value('Glazing-Type'),
"mainheatcont-description":
self.get_property_summary_value('Main-Heating-Controls', 'Description'),
"sheating-energy-eff": self.RATINGS_MAP[
@@ -232,8 +231,7 @@ class XmlParser:
"local-authority": "", # Not included in the xml
"local-authority-label": "",
"fixed-lighting-outlets-count": self.get_node_value('Fixed-Lighting-Outlets-Count'),
- # TODO: Doesn't seem to be included in the xml
- # "energy-tariff": self.get_node_value('Energy-Tariff'),
+ "energy-tariff": energy_tariff,
"mechanical-ventilation": self.MECHANICAL_VENTILATION_MAP[self.get_node_value('Mechanical-Ventilation')],
"solar-water-heating-flag": self.get_node_value('Solar-Water-Heating'),
"co2-emissions-potential": self.get_energy_assessment_value('CO2-Emissions-Potential'),
@@ -328,7 +326,47 @@ class XmlParser:
"mainheat-description": self.get_property_summary_value('Main-Heating', 'Description'),
"floor-height": floor_height,
"glazed-type": glazed_type,
- "energy-tariff": energy_tariff,
+ }
+
+ def get_insulation_wall_area(self):
+ """
+ Computes the insulation wall area for the main dwelling.
+
+ The wall area is the sum of heat_loss_perimeter * room_height over every entry of
+ self.floor_dimensions labelled "Main Dwelling" that is not a room-in-roof entry.
+ The sum of window_area over every entry of self.windows whose window_location is "0"
+ is then subtracted from it.
+
+ Reads self.floor_dimensions and self.windows, so both must be populated before calling.
+
+ :return: wall area minus window area; the result is not clamped, so it can be negative
+ :raises ValueError: from float() if a selected perimeter, height or window area is not numeric text
+ """
+
+ # Room-in-roof entries are skipped: get_floor_dimensions stores them with empty-string
+ # perimeter and height, which float() cannot parse
+ main_dwelling_floors = [
+ f for f in self.floor_dimensions if f["building_part_identifier"] == "Main Dwelling" and not f["room_roof"]
+ ]
+ # NOTE(review): window_location "0" presumably identifies the main dwelling - confirm against the schema
+ main_dwelling_windows = [
+ w for w in self.windows if w["window_location"] == "0"
+ ]
+
+ # Values are held as strings, hence the float() conversions
+ wall_areas = sum([float(f["heat_loss_perimeter"]) * float(f["room_height"]) for f in main_dwelling_floors])
+ window_areas = sum([float(w["window_area"]) for w in main_dwelling_windows])
+ return wall_areas - window_areas
+
+ def extract_additional_data(self):
+ """
+ Collects the values captured outside of the EPC record into self.additional_data.
+
+ Stores the result of get_insulation_wall_area() on self.insulation_wall_area, then builds
+ a dict from attributes already set on the instance (filekey, surveyor_name,
+ space_heating_kwh, water_heating_kwh, number_of_doors, number_of_insulated_doors,
+ number_of_floors, heat_loss_perimeter, party_wall_length, perimeter) plus four values
+ looked up by tag name with get_node_value().
+
+ :return: None; the result is stored on self.additional_data
+ """
+
+ self.insulation_wall_area = self.get_insulation_wall_area()
+
+ self.additional_data = {
+ "file_location": self.filekey,
+ "surveyor_name": self.surveyor_name,
+ "space_heating_kwh": self.space_heating_kwh,
+ "water_heating_kwh": self.water_heating_kwh,
+ # TODO: heating_system and heating_controls are not exported yet
+ # "heating_system": self.heating_system,
+ # "heating_controls": self.heating_controls,
+ "number_of_doors": self.number_of_doors,
+ "number_of_insulated_doors": self.number_of_insulated_doors,
+ "number_of_floors": self.number_of_floors,
+ "insulation_wall_area": self.insulation_wall_area,
+ "heat_loss_perimeter": self.heat_loss_perimeter,
+ "party_wall_length": self.party_wall_length,
+ "perimeter": self.perimeter,
+ "rooms_with_bath_and_or_shower": self.get_node_value('Rooms-With-Bath-And-Or-Shower'),
+ "rooms_with_mixer_shower_no_bath": self.get_node_value('Rooms-With-Mixer-Shower-No-Bath'),
+ # NOTE(review): this key starts with "room_" while the two above start with "rooms_" - confirm intended
+ "room_with_bath_and_mixer_shower": self.get_node_value('Rooms-With-Bath-And-Mixer-Shower'),
+ # NOTE(review): key is spelled "draftproofed" while the XML tag is "Draughtproofed" - confirm intended
+ "percent_draftproofed": self.get_node_value('Percent-Draughtproofed'),
+ }
def get_node_value(self, tag_name):
@@ -516,56 +554,6 @@ class XmlParser:
"constituency-label": constituency_label
}
- def get_property_dimensions(self):
- """
- This function will extract the relevant property dimensions including the floor area,
- number of floors, perimeter, party wall length and the insulation_wall_area.
-
- insulation_wall_area is typically simplified down to perimeter * height * 0.85
- :return:
- """
-
- # Each floor has its own SAP-Floor-Dimension tag
- floor_dimensions = (
- self.xml.getElementsByTagName("SAP-Floor-Dimensions")[0]
- .getElementsByTagName("SAP-Floor-Dimension")
- )
-
- self.number_of_floors = len(floor_dimensions)
-
- self.heat_loss_perimeter = float(
- floor_dimensions[0].getElementsByTagName("Heat-Loss-Perimeter")[0].firstChild.nodeValue
- )
-
- self.party_wall_length = float(
- floor_dimensions[0].getElementsByTagName("Party-Wall-Length")[0].firstChild.nodeValue
- )
-
- party_wall_construction_tag = (
- self.xml.getElementsByTagName("Party-Wall-Construction")[0].firstChild.nodeValue.replace("\n", "").strip()
- )
-
- self.is_there_party_wall = (
- "Yes" if (self.party_wall_length > 0) or (party_wall_construction_tag != "") else "No"
- )
-
- # We pull out all of the floor areas
- floor_areas = [
- float(x.getElementsByTagName("Total-Floor-Area")[0].firstChild.nodeValue) for x in floor_dimensions
- ]
-
- self.total_floor_area = sum(floor_areas)
- self.ground_floor_area = floor_areas[0]
-
- self.floor_height = float(
- floor_dimensions[0]
- .getElementsByTagName("Room-Height")[0]
- .firstChild.nodeValue
- )
-
- self.insulation_wall_area = self.heat_loss_perimeter * self.floor_height * self.INSULATION_WALL_AREA_FACTOR
- self.perimeter = self.heat_loss_perimeter + self.party_wall_length
-
def get_floor_dimensions(self):
"""
@@ -594,16 +582,53 @@ class XmlParser:
'floor': get_part_value(floor_dimension, 'Floor'),
'floor_construction': get_part_value(floor_dimension, 'Floor-Construction'),
'floor_insulation': get_part_value(floor_dimension, 'Floor-Insulation'),
- 'heat_loss-perimeter': get_part_value(floor_dimension, 'Heat-Loss-Perimeter'),
- 'party_wall-length': get_part_value(floor_dimension, 'Party-Wall-Length'),
- 'total_floor-area': get_part_value(floor_dimension, 'Total-Floor-Area'),
- 'room_height': get_part_value(floor_dimension, 'Room-Height')
+ 'heat_loss_perimeter': get_part_value(floor_dimension, 'Heat-Loss-Perimeter'),
+ 'party_wall_length': get_part_value(floor_dimension, 'Party-Wall-Length'),
+ 'total_floor_area': get_part_value(floor_dimension, 'Total-Floor-Area'),
+ 'room_height': get_part_value(floor_dimension, 'Room-Height'),
+ "room_roof": False
} for floor_dimension in sap_floor_dimensions
]
+
+ room_roofs = building_part.getElementsByTagName("SAP-Room-In-Roof")
+ room_roof_data = [
+ {
+ "building_part_identifier": building_part_identifier,
+ "floor": str(max([int(d["floor"]) for d in data]) + 1),
+ "floor_construction": "",
+ "floor_insulation": rr.getElementsByTagName("Insulation")[0].firstChild.nodeValue,
+ "heat_loss_perimeter": "",
+ "party_wall_length": "",
+ "total_floor_area": rr.getElementsByTagName("Floor-Area")[0].firstChild.nodeValue,
+ "room_height": "",
+ "room_roof": True
+ } for rr in room_roofs
+ ]
+
floor_dimensions.extend(data)
+ floor_dimensions.extend(room_roof_data)
self.floor_dimensions = floor_dimensions
+ self.number_of_floors = len(
+ [f for f in self.floor_dimensions if f["building_part_identifier"] == "Main Dwelling"]
+ )
+ self.heat_loss_perimeter = max(
+ [
+ float(f["heat_loss_perimeter"]) for f in self.floor_dimensions
+ if f["building_part_identifier"] == "Main Dwelling" and not f["room_roof"]
+ ]
+ )
+
+ self.party_wall_length = max(
+ [
+ float(f["party_wall_length"]) for f in self.floor_dimensions
+ if f["building_part_identifier"] == "Main Dwelling" and not f["room_roof"]
+ ]
+ )
+
+ self.perimeter = self.heat_loss_perimeter + self.party_wall_length
+
def get_windows(self):
"""
Extracts data about the windows in the property, including the number of windows and the window type.
@@ -612,15 +637,6 @@ class XmlParser:
sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window")
- # This is the data in each sap window:
- # 2
- # 1.55
- # 1
- # 3
- # true
- # 16+
- # 7
-
glazing_type_lookup = {
"3": "double glazing, unknown install date"
}
diff --git a/etl/xml_survey_extraction/app.py b/etl/xml_survey_extraction/app.py
index c32bd787..b3500e71 100644
--- a/etl/xml_survey_extraction/app.py
+++ b/etl/xml_survey_extraction/app.py
@@ -48,7 +48,7 @@ def main():
for xml in xmls:
xml_data = read_from_s3(bucket_name=BUCKET, s3_file_name=xml)
xml_data_io = BytesIO(xml_data)
- xml_parser = XmlParser(file=xml_data_io, filekey=xml, uprn=uprn)
+ xml_parser = XmlParser(file=xml_data_io, filekey=os.path.join(f"s3://{BUCKET}", xml), uprn=uprn)
xml_parser.run()
logger.info(f"Extracted data from {xml}")