diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index cd76dae4..913a04b8 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -26,6 +26,20 @@ class RetrieveFindMyEpc: self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower() + @staticmethod + def extract_low_carbon_sources(soup): + # Find the section header + section_header = soup.find("h3", string="Low and zero carbon energy sources") + if not section_header: + return {} + + # Locate the list following the header + energy_list = section_header.find_next("ul") + + # Extract the list items + sources = {item.get_text(strip=True): True for item in energy_list.find_all("li")} + return sources + def retrieve_newest_find_my_epc_data(self, sap_2012_date=None): """ For a post code and address, we pull out all the required data from the find my epc website @@ -191,6 +205,9 @@ class RetrieveFindMyEpc: # Finally, we format the recommendations recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date) + # 4) Low and zero carbon energy sources + low_carbon_energy_sources = self.extract_low_carbon_sources(address_res) + resulting_data = { 'epc_certificate': epc_certificate, 'current_epc_rating': current_rating.split(' ')[-6], @@ -200,7 +217,8 @@ class RetrieveFindMyEpc: "heating_text": heating_text, "hot_water_text": hot_water_text, "recommendations": recommendations, - **assessment_data + **assessment_data, + **low_carbon_energy_sources } return resulting_data @@ -246,6 +264,11 @@ class RetrieveFindMyEpc: ], "Band A condensing boiler": ["boiler_upgrade"], "Double glazing": ["double_glazing"], + "Flue gas heat recovery device in conjunction with boiler": ["flue_gas_heat_recovery"], + "Wind turbine": ["wind_turbine"], + "Loft insulation": ["loft_insulation"], + "Solar photovoltaic (PV) panels": ["solar_pv"], + "Party wall insulation": ["party_wall_insulation"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py new file mode 100644 index 00000000..060897f8 --- /dev/null +++ b/etl/route_march_data_pull/app.py @@ -0,0 +1,300 @@ +import os +import time + +import pandas as pd +import numpy as np +from tqdm import tqdm + +from dotenv import load_dotenv +from backend.SearchEpc import SearchEpc +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def get_data(asset_list, fulladdress_column, address1_column, postcode_column): + epc_data = [] + errors = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + postcode = home[postcode_column] + house_number = home[address1_column] + full_address = home[fulladdress_column] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + # Retrieve data from FindMyEPC + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(np.random.uniform(0.1, 1)) + try: + postcode = home[postcode_column] + house_number = home[address1_column] + full_address = home[fulladdress_column] + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + # Look for EPC recommendatons + try: + property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) + except: + property_recommendations = {"rows": []} + + # Retrieve data from FindMyEPC + find_epc_searcher = RetrieveFindMyEpc( + address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(np.random.uniform(0.1, 1)) + + epc = { + "row_id": home["row_id"], + **searcher.newest_epc.copy(), + "recommendations": property_recommendations["rows"], + "find_my_epc_data": find_epc_data, + } + + epc_data.append(epc) + except Exception as e: + errors.append(home["row_id"]) + time.sleep(5) + + return epc_data, errors + + +def extract_address1(asset_list, full_address_col, method="first_two_words"): + if method == "first_two_words": + asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + return asset_list + + raise ValueError(f"Method {method} not recognized") + + +def app(): + """ + This app is EPC pulling data for some properties owned by Livewest + + Data request contents: + Date of last EPC + Reason for EPC + SAP score on register + Property Type + Property Area + Property Age + Any Dimensions (HLP,PW,RH) + Property Wall Construction + Heating Type + Secondary Heating + Loft Insulation Depth + + Additional if possible: + Heat loss calculations + EPC recommendations + Property UPRN + + """ + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/" + DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx" + POSTCODE_COLUMN = "Postcode" + FULLADDRESS_COLUMN = "Address" + ADDRESS1_COLUMN = None + ADDRESS1_METHOD = "first_two_words" + + asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0) + asset_list["row_id"] = asset_list.index + + # We clean up portential non-breaking spaces, and double spaces + for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]: + asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) + asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) + + if ADDRESS1_COLUMN is None: + ADDRESS1_COLUMN = "address1_extracted" + asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD) + + epc_data, errors = get_data( + asset_list=asset_list, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN + ) + + # We now retrieve any failed properties + asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] + epc_data_failed, _ = get_data( + asset_list=asset_list_failed, + fulladdress_column=FULLADDRESS_COLUMN, + address1_column=ADDRESS1_COLUMN, + postcode_column=POSTCODE_COLUMN + ) + + # Append the failed data to the main data + epc_data.extend(epc_data_failed) + + epc_df = pd.DataFrame(epc_data) + + # We expand out the recommendations + recommendations_df = epc_df[["row_id", "recommendations"]] + + unique_recommendations = set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = ["row_id"] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data["row_id"] = row["row_id"] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = pd.DataFrame(transformed_data) + # Drop the column that is "" + transformed_df = transformed_df.drop(columns=[""]) + + # Retrieve just the data we need + epc_df = epc_df[ + [ + "row_id", + "uprn", + "property-type", + "built-form", + "inspection-date", + "current-energy-rating", + "current-energy-efficiency", + "roof-description", + "walls-description", + "transaction-type", + # New fields needed + "secondheat-description", + "total-floor-area", + "construction-age-band", + "floor-height", + "number-habitable-rooms", + "mainheat-description", + # + "energy-consumption-current", # kwh/m2 + ] + ] + + asset_list = asset_list.merge( + epc_df, + how="left", + on="row_id" + ).merge( + transformed_df, + how="left", + on="row_id" + ) + + asset_list = asset_list.drop(columns=["row_id"]) + + # Rename the columns + asset_list = asset_list.rename(columns={ + "inspection-date": "Date of last EPC", + "current-energy-efficiency": "SAP score on register", + "current-energy-rating": "EPC rating on register", + "property-type": "Property Type", + "built-form": "Archetype", + "total-floor-area": "Property Floor Area", + "construction-age-band": "Property Age Band", + "floor-height": "Property Floor Height", + "number-habitable-rooms": "Number of Habitable Rooms", + "walls-description": "Wall Construction", + "roof-description": "Roof Construction", + "mainheat-description": "Heating Type", + "secondheat-description": "Secondary Heating", + "transaction-type": "Reason for last EPC", + "energy-consumption-current": "Heat Demand (kWh/m2)" + }) + + asset_list["Estimated Number of Floors"] = asset_list.apply( + lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( + x["Property Type"]) else None, axis=1 + ) + + asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) + # Replace "" value with None + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) + asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) + + asset_list["Estimated Perimeter (m)"] = asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"], + num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], + ), axis=1 + ) + + asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x["Estimated Number of Floors"], + floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + perimeter=x["Estimated Perimeter (m)"], + built_form=x["Archetype"] + ), + axis=1 + ) + + asset_list["Roof Insulation Thickness"] = asset_list.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( + x["Roof Construction"]) else None, + axis=1 + ) + + # Store as an excel + filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx" + asset_list.to_excel(filename, index=False) diff --git a/etl/route_march_data_pull/requirements.txt b/etl/route_march_data_pull/requirements.txt new file mode 100644 index 00000000..e69de29b