Model/etl/xml_survey_extraction/XmlParser.py
2024-07-25 14:47:02 +01:00

581 lines
23 KiB
Python

import re
import usaddress
from datetime import datetime
from xml.dom.minidom import parseString
from backend.app.utils import sap_to_epc
from etl.xml_survey_extraction.pcdb import heating_data
# Maps the raw property-type value read from the XML (see XmlParser.get_property_type) to the
# property-type label used in the extracted EPC record. Values missing from this table raise a
# KeyError at lookup time.
PROPERTY_TYPE_LOOKUP = {
"0": "House",
"House": "House",
}
def get_house_number(address: str) -> str | None:
    """
    Extract the house number from a free-text address.

    The address is first tagged with the usaddress library; the first token labelled
    "AddressNumber" is returned with any commas removed. If usaddress finds no such token,
    a regular expression is used instead, which looks for a number following 'Flat' or
    'Apartment' (case-insensitive), or a number at the very start of the address.

    :param address: the address string to inspect
    :return: the house number as a string, or None if nothing could be found
    """
    for token, label in usaddress.parse(address):
        if label == "AddressNumber":
            # Remove trailing commas
            return token.replace(",", "")

    # Because usaddress isn't optimal for parsing addresses with some prefixes such as 'Flat',
    # we also add a custom approach:
    # look for 'Flat' or 'Apartment' followed by a number, or just a number at the beginning
    fallback_match = re.search(r'(?i)(?:flat|apartment)\s*(\d+)|^\s*(\d+)', address)
    if fallback_match is None:
        return None

    # Exactly one of the two alternatives matched; return whichever group captured the digits
    return next(group for group in fallback_match.groups() if group is not None)
class XmlParser:
    """
    Parser for EPC / EPR survey XML files.

    The constructor parses the XML and works out the file type from the UPRN. Calling `run()`
    then populates the instance attributes below (heating data, assessor details, doors,
    property dimensions) and builds the flat `epc` dictionary.

    Most extraction methods are strict: they index the first matching tag directly and raise
    IndexError / AttributeError when a required tag is missing or empty. The `get_*_value`
    helpers are lenient and return None instead.
    """

    uprn = None

    # heating/emissions information
    space_heating_kwh = None
    water_heating_kwh = None
    heating_system = None
    heating_controls = None

    # Assessor details
    surveyor_name = None

    number_of_doors = None
    number_of_insulated_doors = None

    # Property dimensions
    number_of_floors = None
    perimeter = None
    heat_loss_perimeter = None
    party_wall_length = None
    total_floor_area = None
    ground_floor_area = None
    is_there_party_wall = None
    floor_height = None
    insulation_wall_area = None
    floor_dimensions = None

    rrn = None
    database_data = None

    # The EPC record built by extract_epc(); stays None until extract_epc() has run
    epc = None

    # We assume that the insulation wall area is 85% of the total wall area, as a standard estimate
    INSULATION_WALL_AREA_FACTOR = 0.85

    # The value of the UPRN tells us about the file type that we're parsing
    UPRN_FILETYPE_MAP = {
        0: "EPR",
        -1: "RDSAP_EPR"
    }

    RATINGS_MAP = {
        "0": "N/A",
        "1": "Very Poor",
        "2": "Poor",
        "3": "Average",
        "4": "Good",
        "5": "Very Good"
    }

    MECHANICAL_VENTILATION_MAP = {
        "0": "natural"
    }

    BUILT_FORM_MAP = {
        "1": "Detached",
    }

    GLAZED_AREA_MAP = {
        "4": "Much More Than Typical"
    }

    FUEL_TYPE_MAP = {
        "26": "mains gas (not community)"
    }

    TRANSACTION_TYPE_MAP = {
        "13": "ECO assessment"
    }

    TENURE_MAP = {
        '1': "Owner-occupied"
    }

    def __init__(self, file, filekey, uprn=None):
        """
        Read and parse the XML, then determine the UPRN and the file type.

        :param file: binary file-like object (must support seek() and read() returning bytes)
        :param filekey: identifier of the file, stored as-is on the instance
        :param uprn: optional UPRN; when given it is used instead of the value in the XML
        """
        file.seek(0)  # Ensure the file pointer is at the beginning
        xml_string = file.read().decode('utf-8')
        self.xml = parseString(xml_string)
        self.filekey = filekey

        # The xml parser is used to parse the EPC and EPR xmls, and different file types contain
        # different information. The file type is identified from the UPRN (see get_uprn):
        #   - UPRN made up only of zeros   -> uprn == 0  -> "EPR"
        #   - UPRN tag present but empty   -> uprn == -1 -> "RDSAP_EPR"
        #   - anything else                -> "EPC"
        self.get_uprn(uprn)
        self.file_type = self.UPRN_FILETYPE_MAP.get(self.uprn, "EPC")

    @staticmethod
    def get_node(node):
        """
        Utility function to get the node value from the xml, where data might be optional

        :param node: a DOM element
        :return: the value of the element's first child, or None if the element is empty
        """
        node_first_child = node.firstChild
        if node_first_child is None:
            return None
        return node_first_child.nodeValue

    @staticmethod
    def _first_text(parent, tag_name):
        """
        Return the text of the first `tag_name` element found under `parent`.

        :return: the value of that element's first child, or None if there is no such element
            or it is empty
        """
        matches = parent.getElementsByTagName(tag_name)
        if matches and matches[0].firstChild:
            return matches[0].firstChild.nodeValue
        return None

    def run(self):
        """
        Run every extraction step and build `self.epc`.

        Does nothing for files of type "RDSAP_EPR".
        """
        if self.file_type == "RDSAP_EPR":
            # This file type contains just limited information compared to a regular EPR/EPC, and so we just exit
            # unless we learn something else that determines that we need information from this file
            return

        self.get_assessor_details()
        self.get_heating_and_emissions_data()
        self.get_detailed_heating_specs()

        # Building fabric
        self.get_doors()

        # Property dimensions
        self.get_property_dimensions()

        # Get all of the EPC data
        self.extract_epc()

    def _summary_rating(self, section, tag_name):
        """
        Look up a rating code from the Property-Summary and translate it through RATINGS_MAP.

        :raises KeyError: if the rating is missing or is not a key of RATINGS_MAP
        """
        return self.RATINGS_MAP[self.get_property_summary_value(section, tag_name)]

    def extract_epc(self):
        """
        Build the flat EPC record and store it on `self.epc`.

        :raises NotImplementedError: if the property type is "Flat" (flat-specific fields are
            not handled yet)
        :raises KeyError: if a coded value is missing from the XML or from its lookup table
        """
        property_type = self.get_property_type()

        if property_type == "Flat":
            raise NotImplementedError(
                "Need to handle: heat-loss-corridor, unheated-corridor-length, flat-storey-count, flat-top-storey, "
                "floor-level"
            )

        # Placeholders for the flat-specific fields
        heat_loss_corridor = "NO DATA!"
        unheated_corridor_length = ""
        flat_storey_count = ""
        flat_top_storey = ""
        floor_level = "NO DATA!"

        # Short aliases to keep the record below readable
        node = self.get_node_value
        summary = self.get_property_summary_value
        assessment = self.get_energy_assessment_value
        rating = self._summary_rating

        self.epc = {
            "uprn": self.uprn,
            "uprn-source": "Address Matched",
            "property-type": property_type,
            "building-reference-number": "",
            **self.get_sap(),
            **self.get_property_address(),
            "low-energy-fixed-light-count": node('Low-Energy-Fixed-Lighting-Outlets-Count'),
            # TODO: "floor-height" needs to be done more carefully (Room-Height of the floor dimensions)
            "construction-age-band": node('Construction-Age-Band'),
            "mainheat-energy-eff": rating('Main-Heating', 'Energy-Efficiency-Rating'),
            "windows-env-eff": rating('Window', 'Environmental-Efficiency-Rating'),
            "lighting-energy-eff": rating('Lighting', 'Energy-Efficiency-Rating'),
            "environment-impact-potential": assessment('Environmental-Impact-Potential'),
            # TODO: "glazed-type" needs to be done more carefully since we have multiple windows
            "mainheatcont-description": summary('Main-Heating-Controls', 'Description'),
            "sheating-energy-eff": rating('Secondary-Heating', 'Energy-Efficiency-Rating'),
            "local-authority": "",  # Not included in the xml
            "local-authority-label": "",
            "fixed-lighting-outlets-count": node('Fixed-Lighting-Outlets-Count'),
            # TODO: "energy-tariff" doesn't seem to be included in the xml
            "mechanical-ventilation": self.MECHANICAL_VENTILATION_MAP[node('Mechanical-Ventilation')],
            "solar-water-heating-flag": node('Solar-Water-Heating'),
            "co2-emissions-potential": assessment('CO2-Emissions-Potential'),
            "number-heated-rooms": node('Heated-Room-Count'),
            "floor-description": summary('Floor', 'Description'),
            "energy-consumption-potential": assessment('Energy-Consumption-Potential'),
            "built-form": self.BUILT_FORM_MAP[node('Built-Form')],
            "number-open-fireplaces": node('Open-Fireplaces-Count'),
            "windows-description": summary('Window', 'Description'),
            "glazed-area": self.GLAZED_AREA_MAP[node('Glazed-Area')],
            "inspection-date": node('Inspection-Date'),
            "mains-gas-flag": node('Mains-Gas'),
            "co2-emiss-curr-per-floor-area": assessment('CO2-Emissions-Current-Per-Floor-Area'),
            "heat-loss-corridor": heat_loss_corridor,
            "unheated-corridor-length": unheated_corridor_length,
            "flat-storey-count": flat_storey_count,
            "roof-energy-eff": rating('Roof', 'Energy-Efficiency-Rating'),
            "total-floor-area": node('Total-Floor-Area'),
            "environment-impact-current": assessment('Environmental-Impact-Current'),
            "roof-description": summary('Roof', 'Description'),
            "floor-energy-eff": rating('Floor', 'Energy-Efficiency-Rating'),
            "number-habitable-rooms": node('Habitable-Room-Count'),
            "hot-water-env-eff": rating('Hot-Water', 'Environmental-Efficiency-Rating'),
            "mainheatc-energy-eff": rating('Main-Heating-Controls', 'Energy-Efficiency-Rating'),
            "main-fuel": self.FUEL_TYPE_MAP[node('Main-Fuel-Type')],
            "lighting-env-eff": rating('Lighting', 'Environmental-Efficiency-Rating'),
            "windows-energy-eff": rating('Window', 'Energy-Efficiency-Rating'),
            "floor-env-eff": rating('Floor', 'Environmental-Efficiency-Rating'),
            "sheating-env-eff": rating('Secondary-Heating', 'Environmental-Efficiency-Rating'),
            "lighting-description": summary('Lighting', 'Description'),
            "roof-env-eff": rating('Roof', 'Environmental-Efficiency-Rating'),
            "walls-energy-eff": rating('Wall', 'Energy-Efficiency-Rating'),
            "photo-supply": self.get_photo_supply(),
            "lighting-cost-potential": assessment('Lighting-Cost-Potential'),
            "mainheat-env-eff": rating('Main-Heating', 'Environmental-Efficiency-Rating'),
            "multi-glaze-proportion": node('Multiple-Glazed-Proportion'),
            "main-heating-controls": summary('Main-Heating-Controls', 'Description'),
            "flat-top-storey": flat_top_storey,
            "secondheat-description": summary('Secondary-Heating', 'Description'),
            "walls-env-eff": rating('Wall', 'Environmental-Efficiency-Rating'),
            "transaction-type": self.TRANSACTION_TYPE_MAP[node('Transaction-Type')],
            "extension-count": node('Extensions-Count'),
            "mainheatc-env-eff": rating('Main-Heating-Controls', 'Environmental-Efficiency-Rating'),
            "lmk-key": "",  # Doesn't exist for non-EPC xmls
            "wind-turbine-count": node('Wind-Turbines-Count'),
            "tenure": self.TENURE_MAP[node('Tenure')],
            "floor-level": floor_level,
            "potential-energy-efficiency": assessment('Energy-Rating-Potential'),
            # NOTE(review): the key below is spelt "potentual"; kept as-is because consumers of
            # this record may rely on it - confirm before renaming
            "potentual-energy-rating": sap_to_epc(float(assessment('Energy-Rating-Potential'))),
            "hot-water-energy-eff": rating('Hot-Water', 'Energy-Efficiency-Rating'),
            "low-energy-lighting": node('Low-Energy-Lighting'),
            "walls-description": summary('Wall', 'Description'),
            "hotwater-description": summary('Hot-Water', 'Description'),
            "co2-emissions-current": node('CO2-Emissions-Current'),
            "heating-cost-current": node('Heating-Cost-Current'),
            "heating-cost-potential": assessment('Heating-Cost-Potential'),
            "hot-water-cost-current": node('Hot-Water-Cost-Current'),
            "hot-water-cost-potential": assessment('Hot-Water-Cost-Potential'),
            "lighting-cost-current": node('Lighting-Cost-Current'),
            "energy-consumption-current": node('Energy-Consumption-Current'),
            # The lodgement date fields are filled from the inspection date
            "lodgement-date": node('Inspection-Date'),
            "lodgement-datetime": datetime.strptime(node('Inspection-Date'), "%Y-%m-%d").isoformat(),
            "mainheat-description": summary('Main-Heating', 'Description'),
        }

    def get_node_value(self, tag_name):
        """
        Return the text of the first `tag_name` element anywhere in the document.

        :return: the text, or None if the tag is absent or empty
        """
        return self._first_text(self.xml, tag_name)

    def get_node_value_from_floor_dimensions(self, tag_name):
        """
        Return the text of `tag_name` inside the first SAP-Floor-Dimension element.

        :return: the text, or None if the container or the tag is absent, or the tag is empty
        """
        floor_dimension_nodes = self.xml.getElementsByTagName('SAP-Floor-Dimension')
        if not floor_dimension_nodes:
            return None
        return self._first_text(floor_dimension_nodes[0], tag_name)

    def get_property_summary_value(self, section, tag_name):
        """
        Return the text of `tag_name` inside the first `section` element of the Property-Summary.

        :return: the text, or None if the Property-Summary, the section or the tag is absent,
            or the tag is empty
        """
        summary_nodes = self.xml.getElementsByTagName('Property-Summary')
        if not summary_nodes:
            return None
        section_nodes = summary_nodes[0].getElementsByTagName(section)
        if not section_nodes:
            return None
        return self._first_text(section_nodes[0], tag_name)

    def get_energy_assessment_value(self, tag_name):
        """
        Return the text of `tag_name` inside the first Energy-Assessment element.

        :return: the text, or None if the Energy-Assessment or the tag is absent, or the tag
            is empty
        """
        assessment_nodes = self.xml.getElementsByTagName('Energy-Assessment')
        if not assessment_nodes:
            return None
        return self._first_text(assessment_nodes[0], tag_name)

    def get_uprn(self, uprn):
        """
        Set `self.uprn`, either from the argument or from the UPRN tag of the XML.

        Values set from the XML:
          - -1 if the UPRN tag is empty
          - 0 if the UPRN consists only of zeros (no UPRN has been set)
          - otherwise the lower-cased text following the "uprn-" prefix, or the raw text when
            there is no such prefix

        :param uprn: if not None, used as the UPRN and the XML is not consulted
        :raises IndexError: if `uprn` is None and the XML contains no UPRN tag
        """
        if uprn is not None:
            self.uprn = uprn
            return

        uprn_tag = self.xml.getElementsByTagName('UPRN')[0].firstChild
        if uprn_tag is None:
            self.uprn = -1
            return

        raw_uprn = uprn_tag.nodeValue
        # If all of the characters in the UPRN are 0, then there is no UPRN set
        if raw_uprn.count("0") == len(raw_uprn):
            self.uprn = 0
            return

        # Strip the "UPRN-" prefix; a value without the prefix is kept unchanged rather than
        # failing on the split
        parts = raw_uprn.lower().split("uprn-")
        self.uprn = parts[1] if len(parts) > 1 else raw_uprn

    def get_property_type(self):
        """
        Read the property type from the 'Property-Type' tag (falling back to 'PropertyType1')
        and translate it through PROPERTY_TYPE_LOOKUP.

        :raises ValueError: if no XML has been parsed
        :raises KeyError: if the value is not a key of PROPERTY_TYPE_LOOKUP
        """
        if not self.xml:
            raise ValueError("You need to read the file first")

        property_type = self.xml.getElementsByTagName('Property-Type')
        if not property_type:
            property_type = self.xml.getElementsByTagName('PropertyType1')

        return PROPERTY_TYPE_LOOKUP[property_type[0].firstChild.nodeValue]

    def get_sap(self):
        """
        Read the current SAP score ('Energy-Rating-Current') and convert it with sap_to_epc.

        :return: dict with "current-energy-efficiency" (the score as a string) and
            "current-energy-rating" (the result of sap_to_epc)
        """
        sap_score = self.xml.getElementsByTagName('Energy-Rating-Current')
        sap_score = int(sap_score[0].firstChild.nodeValue)
        epc_rating = sap_to_epc(sap_score)

        return {
            "current-energy-efficiency": str(sap_score),
            "current-energy-rating": epc_rating
        }

    def get_heating_and_emissions_data(self):
        """
        Extract the heating requirements and store them on the instance:
        1) Space heating requirement ('Space-Heating-Existing-Dwelling') -> space_heating_kwh
        2) Water heating requirement ('Water-Heating') -> water_heating_kwh

        Both values are stored as the raw strings found in the XML.
        :return:
        """
        self.space_heating_kwh = self.xml.getElementsByTagName(
            'Space-Heating-Existing-Dwelling'
        )[0].firstChild.nodeValue
        self.water_heating_kwh = self.xml.getElementsByTagName('Water-Heating')[0].firstChild.nodeValue

    def get_detailed_heating_specs(self):
        """
        Given the heating data that is found in the <SAP-Heating> tag, extract the details of the
        heating system and its controls.

        Both codes are looked up in `heating_data`; when a code is not found, a placeholder
        string containing the code is stored instead.
        :return:
        """
        sap_main_heating_details = (
            self.xml.getElementsByTagName('SAP-Heating')[0]
            .getElementsByTagName("Main-Heating-Details")[0]
            .getElementsByTagName("Main-Heating")[0]
        )

        # Get the heating system
        heating_code = sap_main_heating_details.getElementsByTagName("SAP-Main-Heating-Code")[0].firstChild.nodeValue
        heating_system = heating_data[heating_data["code"] == int(heating_code)]["description"]
        heating_system = heating_system.values[0] if not heating_system.empty else f"Heating code: {heating_code}"

        # Get the heating controls
        heating_controls_code = (
            sap_main_heating_details.getElementsByTagName("Main-Heating-Control")[0].firstChild.nodeValue
        )
        heating_controls = heating_data[heating_data["code"] == int(heating_controls_code)]["description"]
        heating_controls = (
            heating_controls.values[0]
            if not heating_controls.empty
            # Report the controls code here (not the heating system code)
            else f"Heating Controls code: {heating_controls_code}"
        )

        self.heating_system = heating_system
        self.heating_controls = heating_controls

    def get_doors(self):
        """
        Read the door counts from the first SAP-Property-Details tag and store them as integers
        on `number_of_doors` and `number_of_insulated_doors`.
        """
        property_details = self.xml.getElementsByTagName('SAP-Property-Details')[0]

        self.number_of_doors = int(
            property_details.getElementsByTagName('Door-Count')[0].firstChild.nodeValue
        )
        self.number_of_insulated_doors = int(
            property_details.getElementsByTagName('Insulated-Door-Count')[0].firstChild.nodeValue
        )

    def get_photo_supply(self):
        """
        Return the 'Percent-Roof-Area' value found under Photovoltaic-Supply / None-Or-No-Details.

        :raises NotImplementedError: if the 'None-Or-No-Details' tag is not present
        """
        photo_supply_tag = self.xml.getElementsByTagName("Photovoltaic-Supply")[0]

        # Check if the "None-Or-No-Details" tag is present
        no_details = photo_supply_tag.getElementsByTagName("None-Or-No-Details")
        if not no_details:
            raise NotImplementedError("Implement me")

        return no_details[0].getElementsByTagName("Percent-Roof-Area")[0].firstChild.nodeValue

    def get_assessor_details(self):
        """
        Store the name of the energy assessor ('Energy-Assessor' / 'Name') on `surveyor_name`.
        """
        energy_assessor_tag = self.xml.getElementsByTagName('Energy-Assessor')[0]
        self.surveyor_name = (
            energy_assessor_tag.getElementsByTagName("Name")[0].firstChild.nodeValue
        )

    def get_property_address(self):
        """
        Extract the address fields from the first Property tag.

        Every field is None when its tag is absent or empty. "address" is the comma-separated
        join of the address lines that are present.

        :return: dict with the keys address1, address2, address3, posttown, postcode, address,
            county, constituency and constituency-label
        """
        property_tag = self.xml.getElementsByTagName("Property")[0]

        def optional_text(tag_name):
            # Text of the first matching tag, or None when the tag is absent or empty
            nodes = property_tag.getElementsByTagName(tag_name)
            return self.get_node(nodes[0]) if nodes else None

        address1 = optional_text("Address-Line-1")
        address2 = optional_text("Address-Line-2")
        address3 = optional_text("Address-Line-3")
        posttown = optional_text("Post-Town")
        postcode = optional_text("Postcode")
        county = optional_text("County")

        address = ", ".join(
            [x for x in [address1, address2, address3] if x is not None]
        )

        # Seems to be unavailable in the xml
        constituency = None
        constituency_label = None

        return {
            "address1": address1,
            "address2": address2,
            "address3": address3,
            "posttown": posttown,
            "postcode": postcode,
            "address": address,
            "county": county,
            "constituency": constituency,
            "constituency-label": constituency_label
        }

    def get_property_dimensions(self):
        """
        This function will extract the relevant property dimensions including the floor area,
        number of floors, perimeter, party wall length and the insulation_wall_area.

        insulation_wall_area is typically simplified down to perimeter * height * 0.85

        The perimeters and the room height are read from the first floor dimension only; the
        total floor area is summed over every floor dimension.
        :return:
        """
        # Each floor has its own SAP-Floor-Dimension tag
        floor_dimensions = (
            self.xml.getElementsByTagName("SAP-Floor-Dimensions")[0]
            .getElementsByTagName("SAP-Floor-Dimension")
        )
        first_floor_dimension = floor_dimensions[0]

        self.number_of_floors = len(floor_dimensions)
        self.heat_loss_perimeter = float(
            first_floor_dimension.getElementsByTagName("Heat-Loss-Perimeter")[0].firstChild.nodeValue
        )
        self.party_wall_length = float(
            first_floor_dimension.getElementsByTagName("Party-Wall-Length")[0].firstChild.nodeValue
        )

        # An empty or absent Party-Wall-Construction tag counts as "no construction recorded"
        party_wall_construction = (
            (self.get_node_value("Party-Wall-Construction") or "").replace("\n", "").strip()
        )

        self.is_there_party_wall = (
            "Yes" if (self.party_wall_length > 0) or (party_wall_construction != "") else "No"
        )

        # We pull out all of the floor areas
        floor_areas = [
            float(x.getElementsByTagName("Total-Floor-Area")[0].firstChild.nodeValue) for x in floor_dimensions
        ]

        self.total_floor_area = sum(floor_areas)
        self.ground_floor_area = floor_areas[0]
        self.floor_height = float(
            first_floor_dimension.getElementsByTagName("Room-Height")[0].firstChild.nodeValue
        )

        self.insulation_wall_area = self.heat_loss_perimeter * self.floor_height * self.INSULATION_WALL_AREA_FACTOR
        self.perimeter = self.heat_loss_perimeter + self.party_wall_length

    def get_floor_dimensions(self):
        """
        Extracts physical measurements of the property such as the floor area, room height, etc.
        across every SAP-Building-Part in the document.

        The result is stored on `self.floor_dimensions` as a list with one dict per
        SAP-Floor-Dimension; values are the raw strings from the XML, or None when a tag is
        absent or empty.
        :return:
        """
        get_part_value = self._first_text

        # One SAP-Building-Part per part of the building, each with its own floor dimensions
        sap_building_parts = self.xml.getElementsByTagName("SAP-Building-Part")

        floor_dimensions = []
        for building_part in sap_building_parts:
            building_part_identifier = building_part.getElementsByTagName("Identifier")[0].firstChild.nodeValue
            sap_floor_dimensions = building_part.getElementsByTagName("SAP-Floor-Dimension")

            # NOTE(review): some keys below mix "_" and "-"; kept as-is because consumers may
            # rely on the exact key names
            floor_dimensions.extend(
                {
                    'building_part_identifier': building_part_identifier,
                    'floor': get_part_value(floor_dimension, 'Floor'),
                    'floor_construction': get_part_value(floor_dimension, 'Floor-Construction'),
                    'floor_insulation': get_part_value(floor_dimension, 'Floor-Insulation'),
                    'heat_loss-perimeter': get_part_value(floor_dimension, 'Heat-Loss-Perimeter'),
                    'party_wall-length': get_part_value(floor_dimension, 'Party-Wall-Length'),
                    'total_floor-area': get_part_value(floor_dimension, 'Total-Floor-Area'),
                    'room_height': get_part_value(floor_dimension, 'Room-Height')
                } for floor_dimension in sap_floor_dimensions
            )

        self.floor_dimensions = floor_dimensions