From 2e589373379610b6380482b5064ab1c80b5858b9 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 12 Sep 2023 11:45:03 +0100
Subject: [PATCH] Added cleaned data storage

---
 model_data/cleaner_app.py | 16 ++++++++++++----
 model_data/utils.py       | 26 ++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/model_data/cleaner_app.py b/model_data/cleaner_app.py
index 9656557b..9a7d25b4 100644
--- a/model_data/cleaner_app.py
+++ b/model_data/cleaner_app.py
@@ -1,14 +1,13 @@
 from tqdm import tqdm
 import os
 import pandas as pd
+import json
 
-from model_data.config import EPC_AUTH_TOKEN
-from epc_api.client import EpcClient
-from model_data.downloader import pagenated_epc_download
 from model_data.EpcClean import EpcClean
 from model_data.analysis.UvalueEstimations import UvalueEstimations
 from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
 from pathlib import Path
+from model_data.utils import save_json_to_s3
 
 LAND_REGISTRY_PATHS = [
     os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",
@@ -23,13 +22,16 @@ LAND_REGISTRY_PATHS = [
 
 EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
 
+ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
+
 
 def app():
     """
     For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
     and produce a dataset of cleaned fields so that when we get new properties, we can quickly
     sanitise any description data
-    :return:
+
+    Currently, this application is just run on a local machine
     """
 
     cleaned_data = {}
@@ -68,3 +70,9 @@ def app():
         # uvalue_estimates.walls
         # uvalue_estimates.floors
         # uvalue_estimates.roofs
+
+    save_json_to_s3(
+        json_data=json.dumps(cleaned_data),
+        s3_file_name="cleaned_epc_data/cleaned.json",
+        bucket_name=f"retrofit-data-{ENVIRONMENT}"
+    )
diff --git a/model_data/utils.py b/model_data/utils.py
index 07642973..d0c2f330 100644
--- a/model_data/utils.py
+++ b/model_data/utils.py
@@ -1,4 +1,5 @@
 import boto3
+from botocore.exceptions import NoCredentialsError, PartialCredentialsError
 import pandas as pd
 from io import BytesIO
 import re
@@ -47,3 +48,28 @@ def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
 
     # Upload the Parquet file to S3
     client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
+
+
+def save_json_to_s3(json_data, bucket_name, s3_file_name):
+    """
+    Upload JSON data to an S3 bucket; failures are printed rather than raised
+
+    :param json_data: The data to upload, passed unchanged as the object body (e.g. a json.dumps string)
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_name: The object key to store the data under in S3
+    """
+    # Ensure you have AWS credentials set up - either via environment variables, AWS CLI, or IAM roles
+    try:
+        s3 = boto3.client('s3')  # NOTE(review): credentials may only be resolved at request time - confirm these handlers can fire
+    except NoCredentialsError:
+        print("Credentials not available.")
+        return  # nothing is uploaded
+    except PartialCredentialsError:
+        print("Incomplete credentials provided.")
+        return  # nothing is uploaded
+
+    try:
+        s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=json_data)
+        print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
+    except Exception as e:  # best-effort: any upload error is reported on stdout, not propagated
+        print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')