Implemented area data extraction for first 6 files

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-04 10:36:39 +01:00
parent 20ba7149c1
commit 2ee9ba9ddd

View file

@ -4,7 +4,9 @@ of insulation measures within homes
"""
import boto3
import PyPDF2
import tempfile
import re
import json
from io import BytesIO
bucket = "retrofit-datalake-dev"
@ -43,29 +45,132 @@ def list_files_in_s3_folder(bucket_name, folder_name):
return files
def fetch_pdf_from_s3(bucket_name, pdf_key, local_path):
    """
    Fetch a PDF from an S3 bucket and save it locally.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - pdf_key: Path (key) of the PDF file within the bucket.
    - local_path: Local path where the PDF should be saved.
    """
    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket_name, Key=pdf_key)

    # Read the PDF bytes and save locally
    with open(local_path, 'wb') as f:
        f.write(response['Body'].read())


def fetch_and_parse_pdf_from_s3(bucket_name, filename):
    """
    Fetch a PDF from an S3 bucket and parse its content.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - filename: Path (key) of the PDF file within the bucket.

    Returns:
    - pages: A list with one entry per PDF page. Each entry is the list of
      text lines extracted from that page, with runs of newlines collapsed
      and leading/trailing whitespace stripped before splitting.
    """
    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket_name, Key=filename)

    # Create a BytesIO object from the PDF bytes so nothing is written to disk
    pdf_content = BytesIO(response['Body'].read())

    # Use PyPDF2 to read the PDF content
    reader = PyPDF2.PdfReader(pdf_content)

    # Extract text from each page and split it into lines
    pages = []
    for page in reader.pages:
        text = remove_excess_newlines(page.extract_text())
        pages.append(text.split("\n"))
    return pages
# Usage
# NOTE(review): example invocation with placeholder values. As written these
# statements sit at module level and would call S3 on import. In this diff
# view they look like the usage snippet of the previous version - confirm
# whether they are still present in the committed file.
bucket_name = 'YOUR_BUCKET_NAME'
pdf_key = 'path/to/your/pdf_file.pdf'
local_path = 'local_file_name.pdf'
fetch_pdf_from_s3(bucket_name, pdf_key, local_path)
def fetch_json_from_s3(bucket_name, file_name):
    """
    Download an object from an S3 bucket and decode it as JSON.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - file_name: Path (key) of the file within the bucket.

    Returns:
    - The value produced by json.loads on the object's UTF-8 decoded body.
    """
    s3_object = boto3.client('s3').get_object(Bucket=bucket_name, Key=file_name)

    # The body is a stream of bytes; decode it before handing it to json
    raw_bytes = s3_object['Body'].read()
    return json.loads(raw_bytes.decode('utf-8'))
def write_json_to_s3(bucket_name, file_name, json_data):
    """
    Serialise a value with json.dumps and store it as an object in S3.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - file_name: Path (key) of the file within the bucket.
    - json_data: The value to serialise and upload.
    """
    client = boto3.client('s3')

    # Serialise first, then upload the resulting string as the object body
    serialised = json.dumps(json_data)
    client.put_object(Bucket=bucket_name, Key=file_name, Body=serialised)
def check_s3_file_exists(bucket_name, file_name):
    """
    Report whether an object is present in an S3 bucket.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - file_name: Path (key) of the file within the bucket.

    Returns:
    - bool: True if the metadata request succeeds, False if it fails with
      error code '404'. Any other client error is re-raised.
    """
    client = boto3.client('s3')
    try:
        # A metadata-only request is enough to establish existence
        client.head_object(Bucket=bucket_name, Key=file_name)
    except client.exceptions.ClientError as err:
        # Only "Not Found" means the file is absent; propagate everything else
        if err.response['Error']['Code'] != '404':
            raise
        return False
    else:
        return True
def remove_excess_newlines(text):
    """
    Collapse every run of newlines in the text into a single newline and
    strip leading and trailing whitespace from the result.
    """
    collapsed = re.sub('\n+', '\n', text)
    return collapsed.strip()
def search_pages(pages, search_term) -> (
str | None, int | None, int | None
):
"""
This method looks for a search term in the EPR and returns the first instance of it
:param pages: list of pages to search through
:param search_term: The term to search for
:return: The text, page number and page index of the first instance of the search term
"""
to_page = len(pages)
from_page = 0
from_index = 0
for page_num in range(from_page, to_page + 1):
page_to_index = len(pages[page_num])
for page_index in range(from_index, page_to_index):
if search_term in pages[page_num][page_index]:
return pages[page_num][page_index], page_num, page_index
return None, None, None
def check_page(pages, page_num, page_index):
    """
    Check that a (page number, line index) position exists in the pages.

    Parameters:
    - pages: List of pages, each page being a list of lines.
    - page_num: Zero-based index of the page.
    - page_index: Zero-based index of the line within that page.

    Returns:
    - bool: True if pages[page_num][page_index] can be read, False otherwise.
      Negative indices are treated as out of range rather than wrapping
      around to the end.
    """
    # A valid index is strictly less than the length; an index equal to the
    # length is already one past the end
    if not 0 <= page_num < len(pages):
        return False
    if not 0 <= page_index < len(pages[page_num]):
        return False
    return True
def handler():
@ -75,10 +180,50 @@ def handler():
sap_calulation_pdfs = [file for file in files if file.endswith(".pdf")]
# For each pdf, we pull out the net & gross wall areas
if check_s3_file_exists(bucket_name=bucket, file_name="wall-area-data/wall-area.json"):
data = fetch_json_from_s3(bucket_name=bucket, file_name="wall-area-data/wall-area.json")
data = json.loads(data)
else:
data = []
used_files = [x["filename"] for x in data]
sap_calulation_pdfs = [filename for filename in sap_calulation_pdfs if filename.split("/")[-1] not in used_files]
data = []
for sap_calculation_file in sap_calulation_pdfs:
# Create a temp file to store the PDF
temp_filename = tempfile.NamedTemporaryFile(suffix=".pdf").name
pdf_file = fetch_pdf_from_s3(bucket, sap_calculation_file, temp_filename)
# Download pdf
pdf_pages = fetch_and_parse_pdf_from_s3(bucket, sap_calculation_file)
# We search for net and gross wall areas
result = search_pages(pdf_pages, "External walls Main")[0]
# This is a row in a table where the columns are:
# Element, Gross, Openings, NetArea, U-value, A x U, K-value, A x K
# The values we're interested in are Gross and NetArea
values = result.split("External walls Main")[1].strip().split(" ")
# Remove the empty white space - we should now have the fields we want
values = [v for v in values if v]
gross_area = float(values[0])
net_area = float(values[2])
# Search for property identifiers
_, pagenum, page_idx = search_pages(pdf_pages, 'Prop Type Ref')
if pagenum != 0:
raise ValueError("Property reference not found on the first page")
# the reference will be on the next line
property_reference = pdf_pages[pagenum][page_idx + 1]
property_reference_number = pdf_pages[pagenum][page_idx + 2]
address = pdf_pages[pagenum][page_idx + 4]
data.append(
{
"property_reference": property_reference,
"reference_number": property_reference_number,
"address": address,
"gross_area": gross_area,
"net_area": net_area,
"filename": sap_calculation_file
}
)
write_json_to_s3(bucket_name=bucket, file_name="wall-area-data/wall-area.json", json_data=json.dumps(data))