From 1bbc89002cc448813cf06d9bf6f1facbc7bc25ca Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 23 Dec 2023 13:57:51 +0000 Subject: [PATCH] building ha7 pipeline --- backend/SearchEpc.py | 4 +- etl/eligibility/Eligibility.py | 3 +- etl/eligibility/ha_15_32/ha7_app.py | 155 ++++++++++++++++++++++++++++ 3 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 etl/eligibility/ha_15_32/ha7_app.py diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index d8ea6b78..f1cda010 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -202,7 +202,9 @@ class SearchEpc: return {}, [] if len(newest_response) != 1: - raise Exception("More than one result found for this address - investigate me") + # It is possible (but rare, and likely an error on EPC lodgement) that we have multiple EPCs that + # were lodged at the exact same time. In this case, we will take the first one + newest_response = [newest_response[0]] older_epcs = [epc for epc in list_of_epcs if epc["lmk-key"] != newest_response[0]["lmk-key"]] diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index c4dc9de0..7a6fade1 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -331,9 +331,10 @@ class Eligibility: is_eligible = self.cavity["suitability"] & self.loft["suitability"] if post_retrofit_sap is None: + message = "subject to post retrofit sap" if is_eligible else "not eligible" self.eco4_warmfront = { "eligible": is_eligible, - "message": "subject to post retrofit sap" + "message": message } return diff --git a/etl/eligibility/ha_15_32/ha7_app.py b/etl/eligibility/ha_15_32/ha7_app.py new file mode 100644 index 00000000..139943a1 --- /dev/null +++ b/etl/eligibility/ha_15_32/ha7_app.py @@ -0,0 +1,155 @@ +import msgpack +import openpyxl +from openpyxl.styles.colors import COLOR_INDEX +from pathlib import Path +from datetime import datetime +import pandas as pd +import numpy as np +from utils.s3 import read_from_s3 +from 
from utils.logger import setup_logger
from dotenv import load_dotenv
from backend.app.utils import read_parquet_from_s3
from tqdm import tqdm
from backend.SearchEpc import SearchEpc
from etl.eligibility.Eligibility import Eligibility
from etl.eligibility.ha_15_32.app import prepare_model_data_row
from etl.epc.DataProcessor import DataProcessor
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from backend.ml_models.api import ModelApi

import re

# This module already lives in etl/eligibility/ha_15_32/, so the .env file is a
# sibling of this file. Re-appending "etl/eligibility/ha_15_32" to the parent
# directory pointed at a doubled path and load_dotenv() found nothing there.
ENV_FILE = Path(__file__).parent / ".env"

logger = setup_logger()
load_dotenv(ENV_FILE)

# Default location of the HA7 asset list, relative to the working directory.
ASSET_LIST_PATH = "etl/eligibility/ha_15_32/HESTIA - HA 7 ASSET LIST.xlsx"


def _fill_colour(cell):
    """Return the ARGB string of ``cell``'s fill colour, or ``None`` if unfilled.

    ``start_color.index`` is an integer for palette colours and a string for
    explicit RGB colours (see the openpyxl colour documentation), so the two
    cases have to be resolved differently.
    """
    index = cell.fill.start_color.index
    if isinstance(index, int):
        # Palette colour: translate the slot number into its ARGB string.
        # NOTE(review): theme colours also expose an integer here - confirm the
        # asset list only uses palette / RGB fills.
        return COLOR_INDEX[index] if 0 <= index < len(COLOR_INDEX) else None
    if not index or index == "00000000":
        # "00000000" is what the original code treated as "no fill".
        return None
    return index


def load_data(path=ASSET_LIST_PATH):
    """
    Load the HA7 asset list from the excel and classify each row by fill colour.

    :param path: location of the workbook; defaults to the HA7 asset list.
    :return: DataFrame with one row per spreadsheet row (header excluded), the
        ninth column renamed to ``is_active``, all-empty columns removed, and
        three derived columns: ``row_color`` (ARGB string or None),
        ``row_colour_name`` (red / green / yellow) and ``row_code``.
    :raises IndexError: if the sheet has fewer than nine columns.
    """

    workbook = openpyxl.load_workbook(path)
    sheet = workbook.active

    # Collect cell values and the fill colour of the first cell of every row
    rows_data = []
    rows_colors = []
    for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
        rows_data.append([cell.value for cell in row])
        rows_colors.append(_fill_colour(row[0]))

    df = pd.DataFrame(rows_data, columns=[cell.value for cell in sheet[1]])

    # Rename the ninth column by assigning a fresh list of labels rather than
    # mutating the Index's underlying array in place.
    columns = list(df.columns)
    columns[8] = "is_active"
    df.columns = columns

    # Remove None columns
    df = df.dropna(axis=1, how='all')

    # Added after the dropna so the colour column survives even when no row is filled
    df['row_color'] = rows_colors

    # We now parse the colours; anything that is neither "red" nor "green"
    # (including unfilled rows) falls through to "yellow".
    df["row_colour_name"] = np.where(
        df["row_color"] == "0000FFFF", "red",
        np.where(df["row_color"] == "00FF00FF", "green", "yellow")
    )
    df["row_code"] = np.where(
        df["row_colour_name"] == "red", "invalid",
        np.where(df["row_colour_name"] == "green", "potential ECO4", "needs criteria change")
    )

    return df


def get_ha7_data(data, cleaned, cleaning_data, created_at):
    """
    Run the EPC search and eligibility checks for every property in ``data``.

    :param data: DataFrame with at least the columns ``Address``, ``Postcode``,
        ``Property Type`` and ``row_id``.
    :param cleaned: passed through to ``Eligibility`` and ``prepare_model_data_row``.
    :param cleaning_data: passed through to ``prepare_model_data_row``.
    :param created_at: passed through to ``prepare_model_data_row``.
    :return: tuple ``(scoring_data, results, nodata)`` where ``scoring_data`` is
        the list of model rows for properties eligible under the ECO4 warmfront
        check, ``results`` is one summary dict per property that had an EPC, and
        ``nodata`` is the list of input rows whose search returned status 204.
    """
    property_type_lookup = {
        "Mid Terrace": "Mid-Terrace",
        "End Terrace": "End-Terrace",
        "Semi Detached": "Semi-Detached",
        "Detached": "Detached",
    }

    scoring_data = []
    results = []
    nodata = []
    for _, house in tqdm(data.iterrows(), total=len(data)):

        searcher = SearchEpc(
            address1=house["Address"],
            postcode=house["Postcode"]
        )

        response = searcher.search()
        if response["status"] == 204:
            # Nothing came back for this address; keep the row so it can be reported
            nodata.append(house)
            continue

        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(
            property_type=property_type_lookup.get(house["Property Type"], None),
            address=house["Address"],
        )

        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
        eligibility.check_gbis_warmfront()
        eligibility.check_eco4_warmfront()

        # The full gbis and eco4 checks are run for every property as well
        eligibility.check_gbis()
        eligibility.check_eco4()

        if eligibility.eco4_warmfront["eligible"]:
            scoring_dictionary = prepare_model_data_row(
                property_id=house["row_id"],
                modelling_epc=eligibility.epc,
                cleaned=cleaned,
                cleaning_data=cleaning_data,
                created_at=created_at,
                old_data=older_epcs,
                full_sap_epc=full_sap_epc
            )
            scoring_data.extend(scoring_dictionary)

        # Whatever the outcome of the checks, we make a record of it
        results.append(
            {
                "row_id": house["row_id"],
                "address": house["Address"],
                "postcode": house["Postcode"],
                "gbis_eligible": eligibility.gbis_warmfront,
                "eco4_eligible": eligibility.eco4_warmfront["eligible"],
                "eco4_message": eligibility.eco4_warmfront["message"],
                "sap": float(eligibility.epc["current-energy-efficiency"]),
                "gbis_eligible_future": eligibility.gbis["eligible"],
                "gbis_eligible_future_message": eligibility.gbis["message"],
                "eco4_eligible_future": eligibility.eco4["eligible"],
                "eco4_eligible_future_message": eligibility.eco4["message"],
                # Property components
                "roof": eligibility.roof["clean_description"],
                "walls": eligibility.walls["clean_description"],
                "heating": eligibility.epc["mainheat-description"],
                "tenure": eligibility.tenure,
                "date_epc": eligibility.epc["lodgement-date"],
            }
        )

    # Previously the collected lists were built and then discarded
    return scoring_data, results, nodata


def app():
    # NOTE(review): only the opening of this function is visible here; the
    # statements below are unchanged.
    data = load_data()
    # Give every row a stable identifier of the form "ha7<position>"
    data["row_id"] = ["ha7" + str(i) for i in range(0, len(data))]

    cleaned = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev"
    )
    cleaned = msgpack.unpackb(cleaned, raw=False)

    cleaning_data = read_parquet_from_s3(
        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
    )

    created_at = datetime.now().isoformat()