From 974b049ad194ddba0ee35fdb3b0c9834fabb8976 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 31 Aug 2023 10:09:08 +0100 Subject: [PATCH 1/4] Added area_data file --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- model_data/simulation_system/area_data.py | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 model_data/simulation_system/area_data.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 05b9012b..b03b31b1 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 3b05c6ac..ca0e1cd9 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/model_data/simulation_system/area_data.py b/model_data/simulation_system/area_data.py new file mode 100644 index 00000000..603b71c9 --- /dev/null +++ b/model_data/simulation_system/area_data.py @@ -0,0 +1,4 @@ +""" +This script produces the dataset used to model the wall area of properties, which is used to estimate the cost +of insulation measures within homes +""" From 20ba7149c148962700a23026ab946b708ff12615 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 31 Aug 2023 13:45:06 +0100 Subject: [PATCH 2/4] wip on area data handler --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- model_data/simulation_system/area_data.py | 80 +++++++++++++++++++ .../requirements/area_data.txt | 2 + 4 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 model_data/simulation_system/requirements/area_data.txt diff --git a/.idea/Model.iml b/.idea/Model.iml index b03b31b1..44faa37d 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index ca0e1cd9..f0144d5b 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/model_data/simulation_system/area_data.py b/model_data/simulation_system/area_data.py index 603b71c9..ee74012b 100644 --- 
a/model_data/simulation_system/area_data.py +++ b/model_data/simulation_system/area_data.py @@ -2,3 +2,83 @@ This script produces the dataset used to model the wall area of properties, which is used to estimate the cost of insulation measures within homes """ +import boto3 +import PyPDF2 +import tempfile + +bucket = "retrofit-datalake-dev" + + +def list_files_in_s3_folder(bucket_name, folder_name): + """ + List files in a specific S3 bucket and folder. + + Parameters: + - bucket_name: Name of the S3 bucket. + - folder_name: Name of the folder (prefix) within the bucket. + + Returns: + - A list of file names within the specified folder. + """ + + # Ensure folder name ends with a '/' + if not folder_name.endswith('/'): + folder_name += '/' + + s3_client = boto3.client('s3') + + # Initialize empty list to store file names + files = [] + + # Initialize paginator + paginator = s3_client.get_paginator('list_objects_v2') + + # Create a PageIterator from the Paginator + page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=folder_name) + + for page in page_iterator: + # Extract file names from the current page and append to the list + files.extend([item['Key'] for item in page.get('Contents', [])]) + + return files + + +def fetch_pdf_from_s3(bucket_name, pdf_key, local_path): + """ + Fetch a PDF from an S3 bucket and save it locally. + + Parameters: + - bucket_name: Name of the S3 bucket. + - pdf_key: Path (key) of the PDF file within the bucket. + - local_path: Local path where the PDF should be saved. 
+ """ + + s3_client = boto3.client('s3') + response = s3_client.get_object(Bucket=bucket_name, Key=pdf_key) + + # Read the PDF bytes and save locally + with open(local_path, 'wb') as f: + f.write(response['Body'].read()) + + +# Usage +bucket_name = 'YOUR_BUCKET_NAME' +pdf_key = 'path/to/your/pdf_file.pdf' +local_path = 'local_file_name.pdf' +fetch_pdf_from_s3(bucket_name, pdf_key, local_path) + + +def handler(): + files = list_files_in_s3_folder(bucket, "full_sap_calculations") + + # get pdfs + sap_calulation_pdfs = [file for file in files if file.endswith(".pdf")] + + # For each pdf, we pull out the net & gross wall areas + + data = [] + for sap_calculation_file in sap_calulation_pdfs: + # Create a temp file to store the PDF + temp_filename = tempfile.NamedTemporaryFile(suffix=".pdf").name + + pdf_file = fetch_pdf_from_s3(bucket, sap_calculation_file, temp_filename) diff --git a/model_data/simulation_system/requirements/area_data.txt b/model_data/simulation_system/requirements/area_data.txt new file mode 100644 index 00000000..f6bff53c --- /dev/null +++ b/model_data/simulation_system/requirements/area_data.txt @@ -0,0 +1,2 @@ +boto3==1.28.38 +PyPDF2==3.0.1 \ No newline at end of file From 2ee9ba9dddb114a5565e4f1e891695c4e29e674d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 4 Sep 2023 10:36:39 +0100 Subject: [PATCH 3/4] Implemented area data extraction for first 6 files --- model_data/simulation_system/area_data.py | 179 ++++++++++++++++++++-- 1 file changed, 162 insertions(+), 17 deletions(-) diff --git a/model_data/simulation_system/area_data.py b/model_data/simulation_system/area_data.py index ee74012b..e381b0e5 100644 --- a/model_data/simulation_system/area_data.py +++ b/model_data/simulation_system/area_data.py @@ -4,7 +4,9 @@ of insulation measures within homes """ import boto3 import PyPDF2 -import tempfile +import re +import json +from io import BytesIO bucket = "retrofit-datalake-dev" @@ -43,29 +45,132 @@ def 
list_files_in_s3_folder(bucket_name, folder_name): return files -def fetch_pdf_from_s3(bucket_name, pdf_key, local_path): +def fetch_and_parse_pdf_from_s3(bucket_name, filename): """ - Fetch a PDF from an S3 bucket and save it locally. + Fetch a PDF from an S3 bucket and parse its content. Parameters: - bucket_name: Name of the S3 bucket. - pdf_key: Path (key) of the PDF file within the bucket. - - local_path: Local path where the PDF should be saved. + + Returns: + - text: Extracted text from the PDF. """ s3_client = boto3.client('s3') - response = s3_client.get_object(Bucket=bucket_name, Key=pdf_key) + response = s3_client.get_object(Bucket=bucket_name, Key=filename) - # Read the PDF bytes and save locally - with open(local_path, 'wb') as f: - f.write(response['Body'].read()) + # Create a BytesIO object from the PDF bytes + pdf_content = BytesIO(response['Body'].read()) + + # Use PyPDF2 to read the PDF content + reader = PyPDF2.PdfReader(pdf_content) + + # Extract text from each page + pages = [] + for page_num in range(len(reader.pages)): + page = reader.pages[page_num] + + text = page.extract_text() + text = remove_excess_newlines(text) + pages.append(text.split("\n")) + + return pages -# Usage -bucket_name = 'YOUR_BUCKET_NAME' -pdf_key = 'path/to/your/pdf_file.pdf' -local_path = 'local_file_name.pdf' -fetch_pdf_from_s3(bucket_name, pdf_key, local_path) +def fetch_json_from_s3(bucket_name, file_name): + # Create an S3 client + s3 = boto3.client('s3') + + # Fetch the file from S3 + response = s3.get_object(Bucket=bucket_name, Key=file_name) + + # Parse and return the JSON data + return json.loads(response['Body'].read().decode('utf-8')) + + +def write_json_to_s3(bucket_name, file_name, json_data): + """ + Write JSON data to a file in an S3 bucket. + + Parameters: + - bucket_name: Name of the S3 bucket. + - file_name: Path (key) of the file within the bucket. + - json_data: JSON data to be saved. 
+ """ + + s3_client = boto3.client('s3') + + # Convert the JSON data to a string + json_string = json.dumps(json_data) + + # Upload the JSON string to S3 + s3_client.put_object(Bucket=bucket_name, Key=file_name, Body=json_string) + + +def check_s3_file_exists(bucket_name, file_name): + """ + Check if a file exists in an S3 bucket. + + Parameters: + - bucket_name: Name of the S3 bucket. + - file_name: Path (key) of the file within the bucket. + + Returns: + - bool: True if the file exists, False otherwise. + """ + + s3_client = boto3.client('s3') + + try: + # Check if the object exists by attempting to retrieve its metadata + s3_client.head_object(Bucket=bucket_name, Key=file_name) + return True + except s3_client.exceptions.ClientError as e: + # If the error code is 404 (Not Found), then the file doesn't exist + if e.response['Error']['Code'] == '404': + return False + # If there's any other exception, raise it + raise + + +def remove_excess_newlines(text): + return re.sub('\n+', '\n', text).strip() + + +def search_pages(pages, search_term) -> ( + str | None, int | None, int | None +): + """ + This method looks for a search term in the EPR and returns the first instance of it + :param pages: list of pages to search through + :param search_term: The term to search for + :return: The text, page number and page index of the first instance of the search term + """ + + to_page = len(pages) + from_page = 0 + from_index = 0 + + for page_num in range(from_page, to_page + 1): + + page_to_index = len(pages[page_num]) + + for page_index in range(from_index, page_to_index): + if search_term in pages[page_num][page_index]: + return pages[page_num][page_index], page_num, page_index + + return None, None, None + + +def check_page(pages, page_num, page_index): + if page_num > len(pages): + return False + + if page_index > len(pages[page_num]): + return False + + return True def handler(): @@ -75,10 +180,50 @@ def handler(): sap_calulation_pdfs = [file for file in files if 
file.endswith(".pdf")] # For each pdf, we pull out the net & gross wall areas + if check_s3_file_exists(bucket_name=bucket, file_name="wall-area-data/wall-area.json"): + data = fetch_json_from_s3(bucket_name=bucket, file_name="wall-area-data/wall-area.json") + data = json.loads(data) + else: + data = [] + + used_files = [x["filename"] for x in data] + + sap_calulation_pdfs = [filename for filename in sap_calulation_pdfs if filename.split("/")[-1] not in used_files] - data = [] for sap_calculation_file in sap_calulation_pdfs: - # Create a temp file to store the PDF - temp_filename = tempfile.NamedTemporaryFile(suffix=".pdf").name - pdf_file = fetch_pdf_from_s3(bucket, sap_calculation_file, temp_filename) + # Download pdf + pdf_pages = fetch_and_parse_pdf_from_s3(bucket, sap_calculation_file) + + # We search for net and gross wall areas + result = search_pages(pdf_pages, "External walls Main")[0] + # This is a row in a table where the columns are: + # Element, Gross, Openings, NetArea, U-value, A x U, K-value, A x K + # The values we're interested in are Gross and NetArea + values = result.split("External walls Main")[1].strip().split(" ") + # Remove the empty white space - we should now have the fields we want + values = [v for v in values if v] + gross_area = float(values[0]) + net_area = float(values[2]) + + # Search for property identifiers + _, pagenum, page_idx = search_pages(pdf_pages, 'Prop Type Ref') + if pagenum != 0: + raise ValueError("Property reference not found on the first page") + # the reference will be on the next line + property_reference = pdf_pages[pagenum][page_idx + 1] + property_reference_number = pdf_pages[pagenum][page_idx + 2] + address = pdf_pages[pagenum][page_idx + 4] + + data.append( + { + "property_reference": property_reference, + "reference_number": property_reference_number, + "address": address, + "gross_area": gross_area, + "net_area": net_area, + "filename": sap_calculation_file + } + ) + + write_json_to_s3(bucket_name=bucket, 
file_name="wall-area-data/wall-area.json", json_data=json.dumps(data)) From 3a2a45ba9bc8de1555865fec8eac44823210b084 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 4 Sep 2023 12:08:55 +0100 Subject: [PATCH 4/4] basic process for creating of area dataset complete --- model_data/simulation_system/area_data.py | 89 ++++++++++++++++++- .../requirements/area_data.txt | 4 +- 2 files changed, 90 insertions(+), 3 deletions(-) diff --git a/model_data/simulation_system/area_data.py b/model_data/simulation_system/area_data.py index e381b0e5..f43093f1 100644 --- a/model_data/simulation_system/area_data.py +++ b/model_data/simulation_system/area_data.py @@ -2,13 +2,17 @@ This script produces the dataset used to model the wall area of properties, which is used to estimate the cost of insulation measures within homes """ +import os import boto3 import PyPDF2 import re import json +from epc_api.client import EpcClient from io import BytesIO +from datetime import datetime bucket = "retrofit-datalake-dev" +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", None) def list_files_in_s3_folder(bucket_name, folder_name): @@ -173,7 +177,10 @@ def check_page(pages, page_num, page_index): return True -def handler(): +def extract_areas(): + """ + This function extracts the net and gross wall areas from the pdf sap calculation files + """ files = list_files_in_s3_folder(bucket, "full_sap_calculations") # get pdfs @@ -215,6 +222,10 @@ def handler(): property_reference_number = pdf_pages[pagenum][page_idx + 2] address = pdf_pages[pagenum][page_idx + 4] + # Search for issued date - the date appears in the field before + _, date_pagenum, date_page_idx = search_pages(pdf_pages, 'Issued on Date') + issued_date = pdf_pages[date_pagenum][date_page_idx + -1] + data.append( { "property_reference": property_reference, @@ -222,8 +233,82 @@ def handler(): "address": address, "gross_area": gross_area, "net_area": net_area, - "filename": sap_calculation_file + "filename": sap_calculation_file, + 
"issued_date": issued_date, } ) write_json_to_s3(bucket_name=bucket, file_name="wall-area-data/wall-area.json", json_data=json.dumps(data)) + + return data + + +def combine_area_data(area_data=None): + """ + This function will merge the area data onto additional features which are fetched from the EPC API + :param area_data: list of dictionaries, containing the areas and the address and issued date of each property + """ + + epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN) + + model_data = [] + for area_config in area_data: + address = area_config["address"] + + # The addresses seem to have this structure: + # <house number>, <street name>, <area>, <postcode> + # Where area is not always in the address + address_destructured = address.split(",") + house_number = address_destructured[0].strip() + street_name = address_destructured[1].strip() + postcode = address_destructured[-1].strip() + + # Fetch epc data + epc_response = epc_client.domestic.search( + params={ + "postcode": postcode, + "address": ", ".join([house_number, street_name]), + } + ) + + epc_data = epc_response["rows"] + if len(epc_data) == 0: + raise ValueError("No EPC data - investigate me") + + if len(epc_data) > 1: + issued_date = datetime.strptime(area_config["issued_date"], '%d/%m/%Y') + # We get the epc data closest to the issued date.
In the edge case that we have two EPC records that are + # equally far away from the issued_date, we take the most recent EPC record + # We sort on lodgement date + epc_data = sorted( + epc_data, key=lambda x: datetime.strptime(x['lodgement-date'], '%Y-%m-%d'), reverse=True + ) + + days_since = [ + abs((datetime.strptime(x["lodgement-date"], '%Y-%m-%d') - issued_date).days) for x in epc_data + ] + # find the location of the closest + closest_index = [i for i, days in enumerate(days_since) if days == min(days_since)][0] + # Take just that epc record + epc_data = [epc_data[closest_index]] + + model_data.append( + dict( + gross_area=area_config["gross_area"], + net_area=area_config["net_area"], + **epc_data[0] + ) + ) + + # Save data + write_json_to_s3( + bucket_name=bucket, + file_name="wall-area-data/wall-area-model-data.json", + json_data=json.dumps(model_data) + ) + + +def handler(): + area_data = extract_areas() + + combine_area_data(area_data) diff --git a/model_data/simulation_system/requirements/area_data.txt b/model_data/simulation_system/requirements/area_data.txt index f6bff53c..321d52e8 100644 --- a/model_data/simulation_system/requirements/area_data.txt +++ b/model_data/simulation_system/requirements/area_data.txt @@ -1,2 +1,4 @@ boto3==1.28.38 -PyPDF2==3.0.1 \ No newline at end of file +PyPDF2==3.0.1 +pydantic==1.10.11 +epc-api-python==1.0.2 \ No newline at end of file