Basic process for creating the area dataset is complete

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-04 12:08:55 +01:00
parent 2ee9ba9ddd
commit 3a2a45ba9b
2 changed files with 90 additions and 3 deletions

View file

@ -2,13 +2,17 @@
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
of insulation measures within homes
"""
import os
import boto3
import PyPDF2
import re
import json
from epc_api.client import EpcClient
from io import BytesIO
from datetime import datetime
bucket = "retrofit-datalake-dev"
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
def list_files_in_s3_folder(bucket_name, folder_name):
@ -173,7 +177,10 @@ def check_page(pages, page_num, page_index):
return True
def handler():
def extract_areas():
"""
This function extracts the net and gross wall areas from the pdf sap calculation files
"""
files = list_files_in_s3_folder(bucket, "full_sap_calculations")
# get pdfs
@ -215,6 +222,10 @@ def handler():
property_reference_number = pdf_pages[pagenum][page_idx + 2]
address = pdf_pages[pagenum][page_idx + 4]
# Search for issued date - the date appears in the field before
_, date_pagenum, date_page_idx = search_pages(pdf_pages, 'Issued on Date')
issued_date = pdf_pages[date_pagenum][date_page_idx + -1]
data.append(
{
"property_reference": property_reference,
@ -222,8 +233,82 @@ def handler():
"address": address,
"gross_area": gross_area,
"net_area": net_area,
"filename": sap_calculation_file
"filename": sap_calculation_file,
"issued_date": issued_date,
}
)
write_json_to_s3(bucket_name=bucket, file_name="wall-area-data/wall-area.json", json_data=json.dumps(data))
return data
def combine_area_data(area_data=None):
    """
    This function merges the extracted wall areas onto the EPC record of each property.

    For every entry in area_data the EPC API is searched using the postcode and the
    "<house number>, <street name>" part of the address. When the search returns more than one
    record, the record whose lodgement date is closest to the issued date of the SAP calculation
    is kept; if two records are equally close, the most recently lodged one is used. The combined
    records are written to S3 as json.

    :param area_data: list of dictionaries, each containing the keys "address", "gross_area" and
        "net_area", plus "issued_date" (dd/mm/YYYY), which is read when several EPC records match
    :return: list of dictionaries holding the gross area, the net area and the fields of the
        selected EPC record
    :raises TypeError: if area_data is not provided
    :raises ValueError: if the EPC search returns no records for a property
    """
    if area_data is None:
        # Fail up front with a clear message instead of part-way through, once the loop starts
        raise TypeError("area_data must be a list of area dictionaries, not None")

    epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN)

    model_data = []
    for area_config in area_data:
        # The addresses seem to have this structure:
        #   <house number>, <street name>, <area (optional)>, <postcode>
        # The area is not always present, so the postcode is read from the last position
        address_parts = [part.strip() for part in area_config["address"].split(",")]
        house_number = address_parts[0]
        street_name = address_parts[1]
        postcode = address_parts[-1]

        # Fetch epc data
        epc_response = epc_client.domestic.search(
            params={
                "postcode": postcode,
                "address": ", ".join([house_number, street_name]),
            }
        )
        epc_data = epc_response["rows"]
        if not epc_data:
            raise ValueError("No EPC data - investigate me")

        if len(epc_data) > 1:
            epc_record = _closest_epc_record(epc_data, area_config["issued_date"])
        else:
            epc_record = epc_data[0]

        model_data.append(
            dict(
                gross_area=area_config["gross_area"],
                net_area=area_config["net_area"],
                **epc_record
            )
        )

    # Save data
    write_json_to_s3(
        bucket_name=bucket,
        file_name="wall-area-data/wall-area-model-data.json",
        json_data=json.dumps(model_data)
    )
    return model_data


def _closest_epc_record(epc_rows, issued_date):
    """
    Return the EPC row lodged closest to issued_date, preferring the most recent row on ties.

    :param epc_rows: list of EPC rows, each with a "lodgement-date" in YYYY-mm-dd format
    :param issued_date: issued date of the SAP calculation, in dd/mm/YYYY format
    """
    issued_on = datetime.strptime(issued_date, '%d/%m/%Y')
    # Parse each lodgement date once and keep it next to its row
    dated_rows = [(datetime.strptime(row['lodgement-date'], '%Y-%m-%d'), row) for row in epc_rows]
    # Most recent first: min() keeps the first of several equally small items, so a tie on the
    # distance to the issued date resolves to the most recently lodged record
    dated_rows.sort(key=lambda dated_row: dated_row[0], reverse=True)
    _, closest_row = min(dated_rows, key=lambda dated_row: abs((dated_row[0] - issued_on).days))
    return closest_row
def handler():
    """
    Entry point: extracts the wall areas and passes them straight on to be combined with EPC data
    """
    combine_area_data(extract_areas())

View file

@ -1,2 +1,4 @@
boto3==1.28.38
PyPDF2==3.0.1
PyPDF2==3.0.1
pydantic==1.10.11
epc-api-python==1.0.2