From 791e22146e6354291ebf56b61aeee3423286a609 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 25 Jul 2024 12:18:48 +0100 Subject: [PATCH] set up fundamental epc extraction --- etl/bill_savings/data_collection.py | 8 +- etl/bill_savings/data_combining.py | 2 +- etl/xml_survey_extraction/XmlParser.py | 200 ++++++++++++++++++++++++- 3 files changed, 198 insertions(+), 12 deletions(-) diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py index d2283ac4..6095741f 100644 --- a/etl/bill_savings/data_collection.py +++ b/etl/bill_savings/data_collection.py @@ -133,8 +133,8 @@ def app(): energy_consumption_data = [] for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)): # Skip the first 50 - if i < 250: - continue + # if i < 344: + # continue data = pd.read_csv(directory / "certificates.csv", low_memory=False) # Rename the columns to the same format as the api returns @@ -146,12 +146,12 @@ def app(): # Take just the newest EPC per uprn, based on lodgement-date data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn") - data = data.sample(sample_size) + data = data.sample(sample_size, replace=False) # We use the addreess data to find the related information collected_data = [] for _, property_data in data.iterrows(): - time.sleep(np.random.uniform(0.3, 2)) + time.sleep(np.random.uniform(0.2, 1.5)) uprn = int(property_data["uprn"]) address = property_data["address1"] diff --git a/etl/bill_savings/data_combining.py b/etl/bill_savings/data_combining.py index 11366360..d3a8d679 100644 --- a/etl/bill_savings/data_combining.py +++ b/etl/bill_savings/data_combining.py @@ -94,7 +94,7 @@ def app(): # We also estimate the energy consumption reduction from this data, by band df["total_consumption"] = df["heating_kwh"] + df["hot_water_kwh"] - consumption_averages = df.groupby("current-energy-rating")["total_consumption"].meam().reset_index() + consumption_averages = 
df.groupby("current-energy-rating")["total_consumption"].mean().reset_index() # Save the consumption averages back to s3 save_dataframe_to_s3_parquet( diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py index de7e35f8..973ea5e8 100644 --- a/etl/xml_survey_extraction/XmlParser.py +++ b/etl/xml_survey_extraction/XmlParser.py @@ -55,6 +55,7 @@ class XmlParser: hot_water_cost_current = None lighting_cost_current = None energy_consumption_current = None + energy_consumption_potential = None heating_system = None heating_controls = None @@ -140,6 +141,30 @@ class XmlParser: "5": "Very Good" } + MECHANICAL_VENTILATION_MAP = { + "0": "natural" + } + + BUILT_FORM_MAP = { + "1": "Detached", + } + + GLAZED_AREA_MAP = { + "4": "Much More Than Typical" + } + + FUEL_TYPE_MAP = { + "26": "mains gas (not community)" + } + + TRANSACTION_TYPE_MAP = { + "13": "ECO assessment" + } + + TENURE_MAP = { + '1': "Owner-occupied" + } + def __init__(self, file, filekey, uprn=None): file.seek(0) # Ensure the file pointer is at the beginning xml_string = file.read().decode('utf-8') @@ -151,7 +176,7 @@ class XmlParser: # In order to identify the file type, we can look for the presence of the 'UPRN' tag # If the UPRN tag is present, we can assume that the file is an EPC # If the UPRN tag is not present, we can assume that the file is an EPR - self.get_uprn() + self.get_uprn(uprn) self.file_type = self.UPRN_FILETYPE_MAP.get(self.uprn, "EPC") @@ -180,6 +205,7 @@ class XmlParser: self.get_assessor_details() self.get_heating_and_emissions_data() + self.get_detailed_heating_specs() # Building fabric @@ -191,11 +217,160 @@ class XmlParser: self.get_hot_water() self.get_lighting() self.get_doors() - self.get_photo_supply() # Property dimensions self.get_property_dimensions() + # Get all of the EPC data + self.extract_epc() + + def extract_epc(self): + # Property Summary + low_energy_fixed_light_count = None + construction_age_band = None + self.epc = { + 
"low-energy-fixed-light-count": self.get_node_value('Low-Energy-Fixed-Lighting-Outlets-Count'), + # TODO: Needs to be done more carefully + # "floor-height" = self.get_node_value_from_floor_dimensions('Room-Height'), + "construction-age-band": self.get_node_value('Construction-Age-Band'), + "mainheat-energy-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Main-Heating', 'Energy-Efficiency-Rating') + ], + "windows-env-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Window', 'Environmental-Efficiency-Rating') + ], + "lighting-energy-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Lighting', 'Energy-Efficiency-Rating') + ], + "environment-impact-potential": self.get_energy_assessment_value('Environmental-Impact-Potential'), + # TODO: Needs to be done more carefully since we have multiple windows + # "glazed-type": self.get_node_value('Glazing-Type'), + "mainheatcont-description": + self.get_property_summary_value('Main-Heating-Controls', 'Description'), + "sheating-energy-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Secondary-Heating', 'Energy-Efficiency-Rating'), + ], + # TODO: Doesn't seem to be included in the xml + # "local-authority": self.get_node_value('Local-Authority'), + "local-authority-label": self.get_node_value('Local-Authority-Label'), + "fixed-lighting-outlets-count": self.get_node_value('Fixed-Lighting-Outlets-Count'), + # TODO: Doesn't seem to be included in the xml + # "energy-tariff": self.get_node_value('Energy-Tariff'), + "mechanical-ventilation": self.MECHANICAL_VENTILATION_MAP[self.get_node_value('Mechanical-Ventilation')], + "solar-water-heating-flag": self.get_node_value('Solar-Water-Heating'), + "co2-emissions-potential": self.get_energy_assessment_value('CO2-Emissions-Potential'), + "number-heated-rooms": self.get_node_value('Heated-Room-Count'), + "floor-description": self.get_property_summary_value('Floor', 'Description'), + "energy-consumption-potential":
self.get_energy_assessment_value('Energy-Consumption-Potential'), + "built-form": self.BUILT_FORM_MAP[self.get_node_value('Built-Form')], + "number-open-fireplaces": self.get_node_value('Open-Fireplaces-Count'), + "windows-description": self.get_property_summary_value('Window', 'Description'), + "glazed-area": self.GLAZED_AREA_MAP[self.get_node_value('Glazed-Area')], + "inspection-date": self.get_node_value('Inspection-Date'), + "mains-gas-flag": self.get_node_value('Mains-Gas'), + "co2-emiss-curr-per-floor-area": self.get_energy_assessment_value('CO2-Emissions-Current-Per-Floor-Area'), + # TODO: Not included in the xml for houses - need an example of flats + # "heat-loss-corridor": self.get_node_value('Heat-Loss-Perimeter'), + # TODO: Need an example of flats + # "flat-storey-count": self.get_node_value('Flat-Storey-Count'), + "roof-energy-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Roof', 'Energy-Efficiency-Rating') + ], + "total-floor-area": self.get_node_value('Total-Floor-Area'), + "environment-impact-current": self.get_energy_assessment_value('Environmental-Impact-Current'), + "roof-description": self.get_property_summary_value('Roof', 'Description'), + "floor-energy-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Floor', 'Energy-Efficiency-Rating') + ], + "number-habitable-rooms": self.get_node_value('Habitable-Room-Count'), + "hot-water-env-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Hot-Water', 'Environmental-Efficiency-Rating') + ], + "mainheatc-energy-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Main-Heating-Controls', 'Energy-Efficiency-Rating') + ], + "main-fuel": self.FUEL_TYPE_MAP[self.get_node_value('Main-Fuel-Type')], + "lighting-env-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Lighting', 'Environmental-Efficiency-Rating') + ], + "windows-energy-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Window', 'Energy-Efficiency-Rating') + ], + "floor-env-eff": self.RATINGS_MAP[ 
+ self.get_property_summary_value('Floor', 'Environmental-Efficiency-Rating') + ], + "sheating-env-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Secondary-Heating', 'Environmental-Efficiency-Rating') + ], + "lighting_description": self.get_property_summary_value('Lighting', 'Description'), + "roof-env-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Roof', 'Environmental-Efficiency-Rating') + ], + "walls-energy-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Wall', 'Energy-Efficiency-Rating') + ], + "photo-supply": self.get_photo_supply(), + "lighting-cost-potential": self.get_energy_assessment_value('Lighting-Cost-Potential'), + "mainheat-env-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Main-Heating', 'Environmental-Efficiency-Rating') + ], + "multi-glaze-proportion": self.get_node_value('Multiple-Glazed-Proportion'), + "main-heating-controls": self.get_property_summary_value('Main-Heating-Controls', 'Description'), + # TODO: Need an example of flats + # "flat-top-storey": self.get_node_value('Flat-Top-Storey'), + "secondheat-description": self.get_property_summary_value('Secondary-Heating', 'Description'), + "walls-env-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Wall', 'Environmental-Efficiency-Rating') + ], + "transaction-type": self.TRANSACTION_TYPE_MAP[self.get_node_value('Transaction-Type')], + "extension-count": self.get_node_value('Extensions-Count'), + "mainheatc-env-eff": self.RATINGS_MAP[ + self.get_property_summary_value('Main-Heating-Controls', 'Environmental-Efficiency-Rating') + ], + "lmk-key": "", # Doesn't exist for non-EPC xmls + "wind-turbines-count": self.get_node_value('Wind-Turbines-Count'), + "tenure": self.TENURE_MAP[self.get_node_value('Tenure')], + # TODO: Need an example of flats + # "floor-level": self.get_node_value('Floor-Level'), + "potential-energy-efficiency": self.get_energy_assessment_value('Energy-Rating-Potential'), + "hot-water-energy-eff": self.RATINGS_MAP[
self.get_property_summary_value('Hot-Water', 'Energy-Efficiency-Rating') + ], + "low-energy-lighting": self.get_node_value('Low-Energy-Lighting'), + "walls-description": self.get_property_summary_value('Wall', 'Description'), + "hotwater-description": self.get_property_summary_value('Hot-Water', 'Description'), + } + + def get_node_value(self, tag_name): + nodes = self.xml.getElementsByTagName(tag_name) + if nodes and nodes[0].firstChild: + return nodes[0].firstChild.nodeValue + return None + + def get_node_value_from_floor_dimensions(self, tag_name): + nodes = self.xml.getElementsByTagName('SAP-Floor-Dimension') + if nodes: + tag = nodes[0].getElementsByTagName(tag_name) + if tag and tag[0].firstChild: + return tag[0].firstChild.nodeValue + return None + + def get_property_summary_value(self, section, tag_name): + nodes = self.xml.getElementsByTagName('Property-Summary')[0].getElementsByTagName(section) + if nodes: + tag = nodes[0].getElementsByTagName(tag_name) + if tag and tag[0].firstChild: + return tag[0].firstChild.nodeValue + return None + + def get_energy_assessment_value(self, tag_name): + nodes = self.xml.getElementsByTagName('Energy-Assessment')[0] + if nodes: + tag = nodes.getElementsByTagName(tag_name) + if tag and tag[0].firstChild: + return tag[0].firstChild.nodeValue + return None + def get_uprn(self, uprn): if uprn is not None: @@ -253,9 +428,14 @@ class XmlParser: self.heating_cost_current = self.xml.getElementsByTagName('Heating-Cost-Current')[0].firstChild.nodeValue self.hot_water_cost_current = self.xml.getElementsByTagName('Hot-Water-Cost-Current')[0].firstChild.nodeValue self.lighting_cost_current = self.xml.getElementsByTagName('Lighting-Cost-Current')[0].firstChild.nodeValue + + # Energy consumption self.energy_consumption_current = ( self.xml.getElementsByTagName("Energy-Consumption-Current")[0].firstChild.nodeValue ) + self.energy_consumption_potential = ( + 
self.xml.getElementsByTagName("Energy-Consumption-Potential")[0].firstChild.nodeValue + ) def get_detailed_heating_specs(self): """ @@ -457,11 +637,17 @@ class XmlParser: ) def get_photo_supply(self): - self.photo_supply = float( - self.xml.getElementsByTagName('Photovoltaic-Supply')[0] - .getElementsByTagName('Percent-Roof-Area')[0] - .firstChild.nodeValue - ) + photo_supply_tag = self.xml.getElementsByTagName("Photovoltaic-Supply")[0] + # Check if the "None-Or-No-Details" tag is present + if photo_supply_tag.getElementsByTagName("None-Or-No-Details"): + return ( + photo_supply_tag. + getElementsByTagName("None-Or-No-Details")[0]. + getElementsByTagName("Percent-Roof-Area")[0]. + firstChild.nodeValue + ) + else: + raise NotImplementedError("Implement me") def get_assessor_details(self):