From 3a2a45ba9bc8de1555865fec8eac44823210b084 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 4 Sep 2023 12:08:55 +0100 Subject: [PATCH] basic process for creating of area dataset complete --- model_data/simulation_system/area_data.py | 89 ++++++++++++++++++- .../requirements/area_data.txt | 4 +- 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/model_data/simulation_system/area_data.py b/model_data/simulation_system/area_data.py index e381b0e5..f43093f1 100644 --- a/model_data/simulation_system/area_data.py +++ b/model_data/simulation_system/area_data.py @@ -2,13 +2,17 @@ This script produces the dataset used to model the wall area of properties, which is used to estimate the cost of insulation measures within homes """ +import os import boto3 import PyPDF2 import re import json +from epc_api.client import EpcClient from io import BytesIO +from datetime import datetime bucket = "retrofit-datalake-dev" +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None) def list_files_in_s3_folder(bucket_name, folder_name): @@ -173,7 +177,10 @@ def check_page(pages, page_num, page_index): return True -def handler(): +def extract_areas(): + """ + This function extracts the net and gross wall areas from the pdf sap calculation files + """ files = list_files_in_s3_folder(bucket, "full_sap_calculations") # get pdfs @@ -215,6 +222,10 @@ def handler(): property_reference_number = pdf_pages[pagenum][page_idx + 2] address = pdf_pages[pagenum][page_idx + 4] + # Search for issued date - the date appears in the field before + _, date_pagenum, date_page_idx = search_pages(pdf_pages, 'Issued on Date') + issued_date = pdf_pages[date_pagenum][date_page_idx + -1] + data.append( { "property_reference": property_reference, @@ -222,8 +233,82 @@ def handler(): "address": address, "gross_area": gross_area, "net_area": net_area, - "filename": sap_calculation_file + "filename": sap_calculation_file, + "issued_date": issued_date, } ) write_json_to_s3(bucket_name=bucket, 
file_name="wall-area-data/wall-area.json", json_data=json.dumps(data)) + + return data + + +def combine_area_data(area_data=None): + """ + This function will merge the area data onto additional features which are fetched from the EPC API + :param area_data: list of dictionaries, containing the areas and the address and issued date of each property + """ + + epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN) + + model_data = [] + for area_config in area_data: + address = area_config["address"] + + # The addresses seem to have this structure: + # house number, street name, area, postcode + # Where area is not always in the address + address_destructured = address.split(",") + house_number = address_destructured[0].strip() + street_name = address_destructured[1].strip() + postcode = address_destructured[-1].strip() + + # Fetch epc data + epc_response = epc_client.domestic.search( + params={ + "postcode": postcode, + "address": ", ".join([house_number, street_name]), + } + ) + + epc_data = epc_response["rows"] + if len(epc_data) == 0: + raise ValueError("No EPC data - investigate me") + + if len(epc_data) > 1: + issued_date = datetime.strptime(area_config["issued_date"], '%d/%m/%Y') + # We get the epc data closest to the issued date. 
In the edge case that we have two EPC records that are + # equally far away from the issued_date, we take the most recent EPC record + # We sort on lodgement date + epc_data = sorted( + epc_data, key=lambda x: datetime.strptime(x['lodgement-date'], '%Y-%m-%d'), reverse=True + ) + + days_since = [ + abs((datetime.strptime(x["lodgement-date"], '%Y-%m-%d') - issued_date).days) for x in epc_data + ] + # find the location of the closest + closest_index = [i for i, days in enumerate(days_since) if days == min(days_since)][0] + # Take just that epc record + epc_data = [epc_data[closest_index]] + + model_data.append( + dict( + gross_area=area_config["gross_area"], + net_area=area_config["net_area"], + **epc_data[0] + ) + ) + + # Save data + write_json_to_s3( + bucket_name=bucket, + file_name="wall-area-data/wall-area-model-data.json", + json_data=json.dumps(model_data) + ) + + +def handler(): + area_data = extract_areas() + + combine_area_data(area_data) diff --git a/model_data/simulation_system/requirements/area_data.txt b/model_data/simulation_system/requirements/area_data.txt index f6bff53c..321d52e8 100644 --- a/model_data/simulation_system/requirements/area_data.txt +++ b/model_data/simulation_system/requirements/area_data.txt @@ -1,2 +1,4 @@ boto3==1.28.38 -PyPDF2==3.0.1 \ No newline at end of file +PyPDF2==3.0.1 +pydantic==1.10.11 +epc-api-python==1.0.2 \ No newline at end of file