wip on area data handler

This commit is contained in:
Khalim Conn-Kowlessar 2023-08-31 13:45:06 +01:00
parent 58f476f59f
commit 20ba7149c1
4 changed files with 84 additions and 2 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (area_data)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (area_data)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -2,3 +2,83 @@
This script produces the dataset used to model the wall area of properties, which is used to estimate the cost
of insulation measures within homes
"""
import boto3
import PyPDF2
import tempfile
# S3 bucket that handler() lists and downloads the SAP calculation PDFs from
bucket = "retrofit-datalake-dev"
def list_files_in_s3_folder(bucket_name, folder_name):
    """
    Collect the keys of every object stored under a folder of an S3 bucket.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - folder_name: Name of the folder (prefix) within the bucket. A trailing
      '/' is appended when it is missing.

    Returns:
    - A list with the key of each object found under the prefix, gathered
      across all result pages.
    """
    # Normalise the prefix so it always ends with a '/'
    prefix = folder_name if folder_name.endswith('/') else folder_name + '/'

    # The paginator walks every page of results for the prefix
    pages = boto3.client('s3').get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )

    # A page without a 'Contents' entry contributes no keys
    return [obj['Key'] for page in pages for obj in page.get('Contents', [])]
def fetch_pdf_from_s3(bucket_name, pdf_key, local_path):
    """
    Fetch a PDF from an S3 bucket and save it locally.

    Parameters:
    - bucket_name: Name of the S3 bucket.
    - pdf_key: Path (key) of the PDF file within the bucket.
    - local_path: Local path where the PDF should be saved. The file is opened
      in 'wb' mode, so any existing file at that path is overwritten.

    Returns:
    - local_path, so the caller can keep working with the downloaded file.
    """
    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket=bucket_name, Key=pdf_key)

    # Read the PDF bytes and save locally
    with open(local_path, 'wb') as f:
        f.write(response['Body'].read())

    # Callers assign the result of this function, so hand back the saved path
    return local_path
# Usage (kept as a comment so that importing this module does not attempt an
# S3 download with placeholder values):
#   bucket_name = 'YOUR_BUCKET_NAME'
#   pdf_key = 'path/to/your/pdf_file.pdf'
#   local_path = 'local_file_name.pdf'
#   fetch_pdf_from_s3(bucket_name, pdf_key, local_path)
def handler():
    """
    Download every SAP calculation PDF so its wall areas can be extracted.

    Lists the objects under the "full_sap_calculations" folder of `bucket`,
    keeps the keys ending in ".pdf" and downloads each of them to a temporary
    location.

    NOTE: work in progress - the extraction of the net & gross wall areas is
    not implemented yet, so `data` is never populated and nothing is returned.
    """
    import os  # local import: only needed to build the temporary file path

    files = list_files_in_s3_folder(bucket, "full_sap_calculations")

    # get pdfs
    sap_calculation_pdfs = [file for file in files if file.endswith(".pdf")]

    # For each pdf, we pull out the net & gross wall areas
    data = []
    for sap_calculation_file in sap_calculation_pdfs:
        # Download into a private temporary directory that is removed when the
        # iteration ends. Taking only `.name` from a NamedTemporaryFile would
        # delete the file straight away and reuse a name that is no longer
        # reserved, leaving the re-created file behind.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_filename = os.path.join(temp_dir, "sap_calculation.pdf")
            fetch_pdf_from_s3(bucket, sap_calculation_file, temp_filename)
            # TODO: parse the PDF at temp_filename and append its net & gross
            # wall areas to `data`

View file

@ -0,0 +1,2 @@
boto3==1.28.38
PyPDF2==3.0.1