"""
This is an adhoc script, used to pull together some of the figures that are being included in the
Warm Homes: Social Housing Wave 3 funding application.

Every breakdown is bound to a module-level name so it can be inspected after the script has run
(for example from an interactive session).
"""

import pandas as pd
import numpy as np

# Source files. These are absolute paths on the author's machine - edit them to run elsewhere.
MEASURES_WORKBOOK = "/Users/khalimconn-kowlessar/Downloads/AIHA Measures Packages 2024_11_13.xlsx"
AIHA_CUSTOMER_DIR = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA"

# The three columns that together identify a unit across the different spreadsheets
ADDRESS_KEY_COLUMNS = ["Address letter or number", "Street address", "Postcode"]

aiha_all_units = pd.read_excel(
    MEASURES_WORKBOOK,
    sheet_name="All Properties - AIHA",
    header=2
)
modelled_units = pd.read_excel(
    MEASURES_WORKBOOK,
    sheet_name="Modelled Properties - Measures",
    header=5
)
# The first two spreadsheet columns have no header, so pandas names them "Unnamed: N"
aiha_all_units = aiha_all_units.drop(columns=['Unnamed: 0', 'Unnamed: 1'])
# Not used further down; kept so the extract is available when working interactively
aiha_extracted_property_data = pd.read_csv(
    f"{AIHA_CUSTOMER_DIR}/extracted_property_data.csv"
)

# Units in the bid are those with a positive expected package cost
aiha_wave_3_units = aiha_all_units[aiha_all_units["Expected Package Cost"].astype(float) > 0]
# TODO: The EPC C property isn't a C!
aiha_epc_breakdown = aiha_wave_3_units["Expected EPC Rating"].replace({"D or E": "E"}).value_counts()

# na=False: rows with a blank Survey Key are treated as "no match" rather than producing a NaN mask,
# which pandas refuses to index with.
# For CAHA
caha_epc_breakdown = modelled_units[
    modelled_units['Survey Key'].str.contains("CAHA", na=False, regex=False)
]['Current EPC Rating'].value_counts()
# For Hornsey
hornsey_epc_breakdown = modelled_units[
    modelled_units['Survey Key'].str.contains("HORNSEY", na=False, regex=False)
]['Current EPC Rating'].value_counts()

aiha_original_asset_data = pd.read_excel(
    f"{AIHA_CUSTOMER_DIR}/240924- KSQ & Domna Info Merge - AIHA - SHDF Wave 3 "
    "bid - Supplementary information.xlsx",
    sheet_name="Archetyping Data",
    header=2
)

# Get the units in the bid: the inner merge keeps only the asset rows that match a Wave 3 unit
aiha_wave_3_features = aiha_original_asset_data[
    ADDRESS_KEY_COLUMNS + ["Wall type", "Property type", "built-form", "floor"]
].merge(
    aiha_wave_3_units[ADDRESS_KEY_COLUMNS],
    how="inner",
    on=ADDRESS_KEY_COLUMNS
)

wall_type_breakdown = aiha_wave_3_features["Wall type"].value_counts()
property_type_breakdown = aiha_wave_3_features.groupby(["Property type", "floor"]).size().reset_index()

# Hornsey data - contained in original asset list
hornsey_asset_list = pd.read_excel(
    f"{AIHA_CUSTOMER_DIR}/SHDF - Template - EOI - Hornsey Housing "
    "Trust.xlsx",
    sheet_name="Ksquared-All units information",
    header=3
)

# We don't need the first row
hornsey_asset_list = hornsey_asset_list.iloc[1:]
# Fill NA values with empty strings
hornsey_asset_list = hornsey_asset_list.fillna("")
# Trim the address key columns and collapse any run of spaces down to a single space
for col in ADDRESS_KEY_COLUMNS:
    hornsey_asset_list[col] = (
        hornsey_asset_list[col].astype(str).str.strip().str.replace(r" {2,}", " ", regex=True)
    )

# Rows without an address number/letter are not units. .copy() so that the column added below is
# written to an independent frame rather than to a slice of the original.
hornsey_asset_list = hornsey_asset_list[hornsey_asset_list["Address letter or number"] != ""].copy()

# Anything not described as cavity is counted as solid. na=False keeps non-text cells (for which
# .str.contains returns NaN, which np.where would treat as True) out of the "Cavity" bucket.
hornsey_asset_list["Wall Type Cleaned"] = np.where(
    hornsey_asset_list["Wall type"].str.contains("Cavity", na=False, regex=False),
    "Cavity",
    "Solid"
)

hornsey_property_type_breakdown = hornsey_asset_list["Property type"].value_counts()

# CAHA
caha_epc_data = pd.read_excel(
    f"{AIHA_CUSTOMER_DIR}/caha_extracted_property_data.xlsx"
)

caha_property_type_breakdown = caha_epc_data["property_type"].value_counts()
caha_wall_type_breakdown = caha_epc_data["wall_type"].value_counts()
import logging
import os
import time

import pandas as pd
from tqdm import tqdm

from dotenv import load_dotenv
from utils.s3 import read_excel_from_s3
from backend.SearchEpc import SearchEpc
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes

from recommendations.recommendation_utils import (
    estimate_perimeter,
    estimate_external_wall_area,
    estimate_number_of_floors
)

load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")

logger = logging.getLogger(__name__)

# Default locations on the author's machine; pass other paths to app() to run elsewhere
DEFAULT_ASSET_LIST_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/Southend Planned programme.xlsx"
)
DEFAULT_OUTPUT_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov "
    "2024.xlsx"
)

# Used when the EPC has no floor height recorded (presumably metres - confirm against the EPC schema)
DEFAULT_FLOOR_HEIGHT = 2.5

# EPC fields carried through to the output
EPC_COLUMNS = [
    "row_id",
    "uprn",
    "property-type",
    "built-form",
    "inspection-date",
    "current-energy-rating",
    "current-energy-efficiency",
    "roof-description",
    "walls-description",
    "transaction-type",
    # New fields needed
    "secondheat-description",
    "total-floor-area",
    "construction-age-band",
    "floor-height",
    "number-habitable-rooms",
    "mainheat-description",
    #
    "energy-consumption-current",  # kwh/m2
    "photo-supply",
]

# EPC field name -> column heading used in the output workbook
OUTPUT_COLUMN_NAMES = {
    "inspection-date": "Date of last EPC",
    "current-energy-efficiency": "SAP score on register",
    "current-energy-rating": "EPC rating on register",
    "property-type": "Property Type",
    "built-form": "Archetype",
    "total-floor-area": "Property Floor Area",
    "construction-age-band": "Property Age Band",
    "floor-height": "Property Floor Height",
    "number-habitable-rooms": "Number of Habitable Rooms",
    "walls-description": "Wall Construction",
    "roof-description": "Roof Construction",
    "mainheat-description": "Heating Type",
    "secondheat-description": "Secondary Heating",
    "transaction-type": "Reason for last EPC",
    "energy-consumption-current": "Heat Demand (kWh/m2)",
    "photo-supply": "% of the Roof with PV"
}


def get_data(asset_list, error_pause_seconds=5):
    """
    Look up the newest EPC, and its recommendations, for every row of the asset list.

    :param asset_list: DataFrame with "Postcode", "address1", "Address" and "row_id" columns
    :param error_pause_seconds: how long to wait after a failed lookup before moving on to the next
        property
    :return: tuple of (epc_data, errors). epc_data is a list of dicts, one per property for which an
        EPC was found, holding the EPC fields plus "row_id" and "recommendations". errors is the list
        of row_ids whose lookup raised. Properties with no EPC appear in neither list.
    """
    epc_data = []
    errors = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        try:
            searcher = SearchEpc(
                address1=str(home["address1"].split(",")[0]),
                postcode=home["Postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=home["Address"],
                max_retries=5
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)
            newest_epc = searcher.newest_epc
            if newest_epc is None:
                continue

            # Look for EPC recommendations; a property without any is still worth keeping
            try:
                property_recommendations = searcher.client.domestic.recommendations(newest_epc["lmk-key"])
            except Exception:
                logger.debug("No recommendations retrieved for row %s", home["row_id"], exc_info=True)
                property_recommendations = {"rows": []}

            epc_data.append(
                {
                    "row_id": home["row_id"],
                    **newest_epc.copy(),
                    "recommendations": property_recommendations["rows"]
                }
            )
        except Exception:
            # Record the failure so that the caller can retry it, and leave a trace of what went wrong
            logger.warning("EPC lookup failed for row %s", home["row_id"], exc_info=True)
            errors.append(home["row_id"])
            time.sleep(error_pause_seconds)

    return epc_data, errors


def _recommendation_flags(epc_df):
    """
    Expand the "recommendations" column into one True/False column per recommendation summary text.

    Columns are sorted so that the output layout is the same from run to run. Recommendations with a
    blank summary text are dropped.
    """
    summaries_by_row = [
        (row_id, {rec["improvement-summary-text"] for rec in recommendations})
        for row_id, recommendations in zip(epc_df["row_id"], epc_df["recommendations"])
    ]
    all_summaries = sorted(set().union(*(summaries for _, summaries in summaries_by_row)))

    flags = pd.DataFrame(
        [
            {"row_id": row_id, **{summary: summary in summaries for summary in all_summaries}}
            for row_id, summaries in summaries_by_row
        ],
        columns=["row_id", *all_summaries]
    )
    # errors="ignore": there is only a "" column when some recommendation had no summary text
    return flags.drop(columns=[""], errors="ignore")


def _floor_height_or_default(raw_height):
    """Return the floor height as a float, or the default when it is missing (NaN, None, "" or 0)."""
    if pd.isnull(raw_height) or not raw_height:
        return DEFAULT_FLOOR_HEIGHT
    return float(raw_height)


def app(asset_list_path=DEFAULT_ASSET_LIST_PATH, output_path=DEFAULT_OUTPUT_PATH):
    """
    This app is pulling EPC data for the properties in the Southend planned programme workbook

    Data request contents:
    Date of last EPC
    Reason for EPC
    SAP score on register
    Property Type
    Property Area
    Property Age
    Any Dimensions (HLP,PW,RH)
    Property Wall Construction
    Heating Type
    Secondary Heating
    Loft Insulation Depth

    Additional if possible:
    Heat loss calculations
    EPC recommendations
    Property UPRN

    :param asset_list_path: workbook holding the asset list, read from its "Planned RM" sheet
    :param output_path: where the enriched asset list is written, as an Excel workbook
    :raises RuntimeError: if no EPC could be found for any property
    """
    asset_list = pd.read_excel(
        asset_list_path,
        header=0,
        sheet_name="Planned RM"
    )
    asset_list["row_id"] = asset_list.index
    asset_list["address1"] = asset_list["Address"].str.split(",").str[0]

    epc_data, errors = get_data(asset_list)

    # We now retry any failed properties once, and append whatever succeeds to the main data
    retried_data, still_failing = get_data(asset_list[asset_list["row_id"].isin(errors)])
    epc_data.extend(retried_data)
    if still_failing:
        logger.warning("%d properties still failed after the retry: %s", len(still_failing), still_failing)

    if not epc_data:
        raise RuntimeError("No EPC could be found for any property in the asset list")

    epc_df = pd.DataFrame(epc_data)

    # We expand out the recommendations
    recommendation_flags = _recommendation_flags(epc_df)

    # Retrieve just the data we need. reindex leaves an empty column, rather than raising, when none
    # of the retrieved EPCs carries one of the fields.
    epc_df = epc_df.reindex(columns=EPC_COLUMNS)

    asset_list = asset_list.merge(
        epc_df,
        how="left",
        on="row_id"
    ).merge(
        recommendation_flags,
        how="left",
        on="row_id"
    )

    asset_list = asset_list.drop(columns=["row_id"])

    # Rename the columns
    asset_list = asset_list.rename(columns=OUTPUT_COLUMN_NAMES)

    asset_list["Estimated Number of Floors"] = asset_list.apply(
        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
            x["Property Type"]) else None, axis=1
    )

    # Blank strings and any other unparseable values become NaN
    for column in ("Property Floor Area", "Number of Habitable Rooms"):
        asset_list[column] = pd.to_numeric(asset_list[column], errors="coerce")

    # Floor area and room count are spread evenly across the estimated number of floors
    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
        lambda x: estimate_perimeter(
            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
        ), axis=1
    )

    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
        lambda x: estimate_external_wall_area(
            num_floors=x["Estimated Number of Floors"],
            floor_height=_floor_height_or_default(x["Property Floor Height"]),
            perimeter=x["Estimated Perimeter (m)"],
            built_form=x["Archetype"]
        ),
        axis=1
    )

    asset_list["Roof Insulation Thickness"] = asset_list.apply(
        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
            x["Roof Construction"]) else None,
        axis=1
    )

    # Store as an excel
    asset_list.to_excel(output_path, index=False)

    # Sanity check on the PV figures: the overall breakdown, plus the properties reported at 50%
    logger.info("PV roof coverage breakdown:\n%s", asset_list["% of the Roof with PV"].value_counts())
    logger.info(
        "Properties with 50%% PV roof coverage:\n%s",
        asset_list[asset_list["% of the Roof with PV"] == "50.0"][["Address", "Postcode"]]
    )
b/etl/customers/stonewater/Wave 3 Preparation.py @@ -117,6 +117,7 @@ def extract_summary_report(pdf_path): - Fuel Bill - Address """ + data = { "Address": None, "Postcode": None,