Adding in get_cleaned function

2026-07-27 23:35:01 +00:00 · 2023-09-13 13:30:45 +01:00 · 2023-09-13 13:30:45 +01:00 · dd90065874
commit dd90065874
parent 98b6261fe1
6 changed files with 69 additions and 5540 deletions
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@ -8,6 +8,7 @@ from backend.app.config import get_settings
 from backend.Property import Property
 from epc_api.client import EpcClient
 from utils.logger import setup_logger
+from utils.s3 import read_from_s3
 from recommendations.FloorRecommendations import FloorRecommendations
 from recommendations.WallRecommendations import WallRecommendations
 from utils.uvalue_estimates import classify_decile_newvalues
@ -17,6 +18,7 @@ from sqlalchemy.orm import sessionmaker
 from sqlalchemy.exc import IntegrityError, OperationalError
 from datetime import datetime
 import pandas as pd
+import msgpack

 # model apis
 from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
@ -45,7 +47,6 @@ from model_data.simulation_system.core.Settings import (
 # TODO: This is placeholder until data is stored in DB
 from backend.app.plan.uvalue_estimates_walls import uvalue_estimates_walls
 from backend.app.plan.uvalue_estimates_floors import uvalue_estimates_floors
-from backend.app.plan.temp_cleaned_data import cleaned

 logger = setup_logger()

@ -138,6 +139,25 @@ def insert_temp_recommendation_id(property_recommendations):
    return property_recommendations


+def get_cleaned():
+    """
+    Retrieve the cleaned dataset from s3, which has the cleaned descriptions for the
+    epc dataset.
+
+    The object is read from the key "cleaned_epc_data/cleaned.bson" in the bucket
+    "retrofit-data-<ENVIRONMENT>", where ENVIRONMENT is taken from get_settings().
+    Despite the .bson extension of the key, the data is stored in MessagePack format
+    and is therefore decoded with msgpack before being returned.
+
+    NOTE(review): every call performs a fresh S3 read and decode; nothing is cached here.
+
+    :return: The decoded cleaned dataset (whatever structure msgpack.unpackb yields)
+    """
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)
+    )
+
+    # raw=False asks msgpack to decode MessagePack strings to str rather than bytes
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    return cleaned
+
+
 def score_measures():
    """
    This wrapper function prepares data to be passed to the sap model api
@ -220,8 +240,10 @@ async def trigger_plan(body: PlanTriggerRequest):
        #       table probably won't be very large and won't be updated that often. It might be better to
        #       store this data in s3 load it into memory when the app starts up. We will test this

+        logger.info("Reading in materials and cleaned datasets")
        materials = get_materials(session)
        materials_by_type = filter_materials(materials)
+        cleaned = get_cleaned()

        logger.info("Getting components and properties recommendations")

--- a/backend/app/plan/temp_cleaned_data.py
+++ b/backend/app/plan/temp_cleaned_data.py
--- a/backend/requirements/base.txt
+++ b/backend/requirements/base.txt
@ -1,3 +1,4 @@
+msgpack==1.0.5
 anyio==3.7.1
 cffi==1.15.1
 click==8.1.3
--- a/model_data/cleaner_app.py
+++ b/model_data/cleaner_app.py
@ -7,7 +7,7 @@ from model_data.EpcClean import EpcClean
 from model_data.analysis.UvalueEstimations import UvalueEstimations
 from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
 from pathlib import Path
-from model_data.utils import save_data_to_s3
+from utils.s3 import save_data_to_s3

 LAND_REGISTRY_PATHS = [
    os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",
--- a/model_data/utils.py
+++ b/model_data/utils.py
@ -1,6 +1,4 @@
 import boto3
-from botocore.exceptions import NoCredentialsError, PartialCredentialsError
-import pandas as pd
 from io import BytesIO
 import re
 from textblob import TextBlob
@ -48,45 +46,3 @@ def save_dataframe_to_s3_parquet(df, bucket_name, file_key):

    # Upload the Parquet file to S3
    client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
-
-
-def save_data_to_s3(data, bucket_name, s3_file_name):
-    """
-    Save an object to an S3 bucket
-
-    :param data: The data to save
-    :param bucket_name: The name of the S3 bucket
-    :param s3_file_name: The file name to use for the saved data in S3
-    """
-    # Ensure you have AWS credentials set up - either via environment variables, AWS CLI, or IAM roles
-    try:
-        s3 = boto3.client('s3')
-    except NoCredentialsError:
-        print("Credentials not available.")
-        return
-    except PartialCredentialsError:
-        print("Incomplete credentials provided.")
-        return
-
-    try:
-        s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=data)
-        print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
-    except Exception as e:
-        print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
-
-
-def read_from_s3(bucket_name, s3_file_name):
-    """
-    Read an object from s3. Decoding of the data is left for outside of this function
-
-    :param bucket_name: The name of the S3 bucket
-    :param s3_file_name: The file name to use for the saved data in S3
-    """
-    # Initialize a session using Amazon S3
-    s3 = boto3.resource('s3')
-
-    # Get the MessagePack data from S3
-    obj = s3.Object(bucket_name, s3_file_name)
-    data = obj.get()['Body'].read()
-
-    return data
--- a/utils/s3.py
+++ b/utils/s3.py
@ -0,0 +1,44 @@
+import boto3
+from botocore.exceptions import NoCredentialsError, PartialCredentialsError
+
+
+def read_from_s3(bucket_name, s3_file_name):
+    """
+    Read an object from s3 and return its contents. Decoding of the data is left for
+    outside of this function
+
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_name: The key (file name) of the object to read from S3
+    :return: The full body of the object, exactly as returned by the body's read()
+    """
+    # Create an S3 resource handle via boto3
+    s3 = boto3.resource('s3')
+
+    # Fetch the object and read its entire body into memory; the payload is not
+    # interpreted here, so this is not specific to MessagePack data
+    obj = s3.Object(bucket_name, s3_file_name)
+    data = obj.get()['Body'].read()
+
+    return data
+
+
+def save_data_to_s3(data, bucket_name, s3_file_name):
+    """
+    Save an object to an S3 bucket
+
+    Failures are reported with print() rather than raised, so the function returns
+    None whether or not the upload succeeded.
+
+    :param data: The data to save (passed unchanged as the Body of put_object)
+    :param bucket_name: The name of the S3 bucket
+    :param s3_file_name: The file name (key) to use for the saved data in S3
+    :return: None
+    """
+    # Ensure you have AWS credentials set up - either via environment variables, AWS CLI, or IAM roles
+    # NOTE(review): boto3 may only surface missing credentials when a request is sent
+    # rather than when the client is created - confirm these handlers are reachable
+    try:
+        s3 = boto3.client('s3')
+    except NoCredentialsError:
+        print("Credentials not available.")
+        return
+    except PartialCredentialsError:
+        print("Incomplete credentials provided.")
+        return
+
+    # Any error raised by the upload is caught and printed, not re-raised
+    try:
+        s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=data)
+        print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
+    except Exception as e:
+        print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')