From 2e589373379610b6380482b5064ab1c80b5858b9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 12 Sep 2023 11:45:03 +0100 Subject: [PATCH] Added cleaned data storage --- model_data/cleaner_app.py | 16 ++++++++++++---- model_data/utils.py | 26 ++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/model_data/cleaner_app.py b/model_data/cleaner_app.py index 9656557b..9a7d25b4 100644 --- a/model_data/cleaner_app.py +++ b/model_data/cleaner_app.py @@ -1,14 +1,13 @@ from tqdm import tqdm import os import pandas as pd +import json -from model_data.config import EPC_AUTH_TOKEN -from epc_api.client import EpcClient -from model_data.downloader import pagenated_epc_download from model_data.EpcClean import EpcClean from model_data.analysis.UvalueEstimations import UvalueEstimations from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE from pathlib import Path +from model_data.utils import save_json_to_s3 LAND_REGISTRY_PATHS = [ os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv", @@ -23,13 +22,16 @@ LAND_REGISTRY_PATHS = [ EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates" +ENVIRONMENT = os.getenv("ENVIRONMENT", "dev") + def app(): """ For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API and produce a dataset of cleaned fields so that when we get new properties, we can quickly sanitise any description data - :return: + + Currently, this application is just run on a local machine """ cleaned_data = {} @@ -68,3 +70,9 @@ def app(): # uvalue_estimates.walls # uvalue_estimates.floors # uvalue_estimates.roofs + + save_json_to_s3( + json_data=json.dumps(cleaned_data), + s3_file_name="cleaned_epc_data/cleaned.json", + bucket_name=f"retrofit-data-{ENVIRONMENT}" + ) diff --git a/model_data/utils.py b/model_data/utils.py index 07642973..d0c2f330 100644 --- 
a/model_data/utils.py
+++ b/model_data/utils.py
@@ -1,4 +1,5 @@
 import boto3
+from botocore.exceptions import NoCredentialsError, PartialCredentialsError
 import pandas as pd
 from io import BytesIO
 import re
@@ -47,3 +48,28 @@ def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
 
     # Upload the Parquet file to S3
     client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
+
+
+def save_json_to_s3(json_data, bucket_name, s3_file_name):
+    """
+    Save serialised JSON to an S3 bucket on a best-effort basis: failures are printed, not raised
+
+    :param json_data: The JSON data to save (passed to S3 unchanged as the object body)
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_name: The file name to use for the saved data in S3
+    :return: True if the upload succeeded, False otherwise
+    """
+    # Needs AWS credentials (env vars, AWS CLI or IAM role); missing ones usually only surface at request time
+    try:
+        s3 = boto3.client('s3')
+        s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=json_data)
+    except NoCredentialsError:
+        print("Credentials not available.")
+    except PartialCredentialsError:
+        print("Incomplete credentials provided.")
+    except Exception as e:
+        print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
+    else:
+        print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
+        return True
+    return False