mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Basic process for creating the area dataset is complete
This commit is contained in:
parent
2ee9ba9ddd
commit
3a2a45ba9b
2 changed files with 90 additions and 3 deletions
|
|
@ -2,13 +2,17 @@
|
|||
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
|
||||
of insulation measures within homes
|
||||
"""
|
||||
import os
|
||||
import boto3
|
||||
import PyPDF2
|
||||
import re
|
||||
import json
|
||||
from epc_api.client import EpcClient
|
||||
from io import BytesIO
|
||||
from datetime import datetime
|
||||
|
||||
bucket = "retrofit-datalake-dev"
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None)
|
||||
|
||||
|
||||
def list_files_in_s3_folder(bucket_name, folder_name):
|
||||
|
|
@ -173,7 +177,10 @@ def check_page(pages, page_num, page_index):
|
|||
return True
|
||||
|
||||
|
||||
def handler():
|
||||
def extract_areas():
|
||||
"""
|
||||
This function extracts the net and gross wall areas from the pdf sap calculation files
|
||||
"""
|
||||
files = list_files_in_s3_folder(bucket, "full_sap_calculations")
|
||||
|
||||
# get pdfs
|
||||
|
|
@ -215,6 +222,10 @@ def handler():
|
|||
property_reference_number = pdf_pages[pagenum][page_idx + 2]
|
||||
address = pdf_pages[pagenum][page_idx + 4]
|
||||
|
||||
# Search for issued date - the date appears in the field before
|
||||
_, date_pagenum, date_page_idx = search_pages(pdf_pages, 'Issued on Date')
|
||||
issued_date = pdf_pages[date_pagenum][date_page_idx + -1]
|
||||
|
||||
data.append(
|
||||
{
|
||||
"property_reference": property_reference,
|
||||
|
|
@ -222,8 +233,82 @@ def handler():
|
|||
"address": address,
|
||||
"gross_area": gross_area,
|
||||
"net_area": net_area,
|
||||
"filename": sap_calculation_file
|
||||
"filename": sap_calculation_file,
|
||||
"issued_date": issued_date,
|
||||
}
|
||||
)
|
||||
|
||||
write_json_to_s3(bucket_name=bucket, file_name="wall-area-data/wall-area.json", json_data=json.dumps(data))
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def combine_area_data(area_data=None):
    """
    This function merges the extracted wall area data with EPC data fetched from the EPC API, and saves the
    combined records to s3 as json.

    For each area record the EPC API is searched using the postcode and the "<house number>, <street name>" part
    of the address. If the search returns more than one EPC record, the record lodged closest to the issued date
    of the sap calculation is used (on a tie, the most recently lodged record wins).

    :param area_data: list of dictionaries, each containing the "address", "issued_date" (dd/mm/YYYY),
        "gross_area" and "net_area" of a property
    :return: list of dictionaries, each containing the gross and net area together with the fields of the
        matched EPC record
    :raises TypeError: if area_data is not provided
    :raises ValueError: if the EPC search returns no records for an address
    """
    if area_data is None:
        # Iterating None would raise an opaque "NoneType is not iterable" error, so fail with a clear message
        raise TypeError("area_data is required: expected a list of area dictionaries, got None")

    epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN)

    model_data = []
    for area_config in area_data:
        address = area_config["address"]

        # The addresses seem to have this structure:
        # "<house number>, <street name>, <area (optional)>, <postcode>"
        # where area is not always in the address, so the postcode is read from the end
        address_destructured = address.split(",")
        house_number = address_destructured[0].strip()
        street_name = address_destructured[1].strip()
        postcode = address_destructured[-1].strip()

        # Fetch epc data
        epc_response = epc_client.domestic.search(
            params={
                "postcode": postcode,
                "address": ", ".join([house_number, street_name]),
            }
        )

        epc_data = epc_response["rows"]
        if not epc_data:
            # Include the address so the failing record can be found within the batch
            raise ValueError(f"No EPC data - investigate me (address: {address!r})")

        if len(epc_data) > 1:
            issued_date = datetime.strptime(area_config["issued_date"], "%d/%m/%Y")
            epc_record = _closest_epc_record(epc_data, issued_date)
        else:
            epc_record = epc_data[0]

        model_data.append(
            dict(
                gross_area=area_config["gross_area"],
                net_area=area_config["net_area"],
                **epc_record
            )
        )

    # Save data
    write_json_to_s3(
        bucket_name=bucket,
        file_name="wall-area-data/wall-area-model-data.json",
        json_data=json.dumps(model_data)
    )

    return model_data


def _closest_epc_record(epc_rows, issued_date):
    """
    This function returns the EPC record whose lodgement date is closest to the issued date

    :param epc_rows: list of EPC record dictionaries, each with a "lodgement-date" in YYYY-mm-dd format
    :param issued_date: datetime the sap calculation was issued on
    :return: the closest EPC record; on a tie, the most recently lodged record
    """
    # Parse each lodgement date once and keep it next to its record
    dated_rows = [(datetime.strptime(row["lodgement-date"], "%Y-%m-%d"), row) for row in epc_rows]

    # Most recent first: min() returns the first minimal item, so on the edge case that two records are
    # equally far away from the issued date, the most recent record is taken
    dated_rows.sort(key=lambda dated_row: dated_row[0], reverse=True)

    _, closest_row = min(dated_rows, key=lambda dated_row: abs((dated_row[0] - issued_date).days))
    return closest_row
|
||||
|
||||
|
||||
def handler():
    """
    Entry point: extracts the wall areas from the sap calculation files, then combines them with EPC data
    """
    combine_area_data(extract_areas())
|
||||
|
|
|
|||
|
|
@ -1,2 +1,4 @@
|
|||
boto3==1.28.38
|
||||
PyPDF2==3.0.1
|
||||
PyPDF2==3.0.1
|
||||
pydantic==1.10.11
|
||||
epc-api-python==1.0.2
|
||||
Loading…
Add table
Reference in a new issue