Adding in get_cleaned function

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-13 13:30:45 +01:00
parent 98b6261fe1
commit dd90065874
6 changed files with 69 additions and 5540 deletions

View file

@ -8,6 +8,7 @@ from backend.app.config import get_settings
from backend.Property import Property
from epc_api.client import EpcClient
from utils.logger import setup_logger
from utils.s3 import read_from_s3
from recommendations.FloorRecommendations import FloorRecommendations
from recommendations.WallRecommendations import WallRecommendations
from utils.uvalue_estimates import classify_decile_newvalues
@ -17,6 +18,7 @@ from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError, OperationalError
from datetime import datetime
import pandas as pd
import msgpack
# model apis
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
@ -45,7 +47,6 @@ from model_data.simulation_system.core.Settings import (
# TODO: This is placeholder until data is stored in DB
from backend.app.plan.uvalue_estimates_walls import uvalue_estimates_walls
from backend.app.plan.uvalue_estimates_floors import uvalue_estimates_floors
from backend.app.plan.temp_cleaned_data import cleaned
logger = setup_logger()
@ -138,6 +139,25 @@ def insert_temp_recommendation_id(property_recommendations):
return property_recommendations
def get_cleaned():
    """
    Load the cleaned EPC description dataset from s3 and decode it.

    The object is read as raw bytes from the environment-specific retrofit data
    bucket and is stored in MessagePack format, so it is unpacked before being
    handed back to the caller.
    :return: The object produced by unpacking the MessagePack payload
    """
    # The bucket name is suffixed with the environment taken from the app settings
    environment = get_settings().ENVIRONMENT
    bucket = "retrofit-data-{environment}".format(environment=environment)

    packed_bytes = read_from_s3(
        bucket_name=bucket,
        s3_file_name="cleaned_epc_data/cleaned.bson",
    )
    return msgpack.unpackb(packed_bytes, raw=False)
def score_measures():
"""
This wrapper function prepares data to be passed to the sap model api
@ -220,8 +240,10 @@ async def trigger_plan(body: PlanTriggerRequest):
# table probably won't be very large and won't be updated that often. It might be better to
# store this data in s3 and load it into memory when the app starts up. We will test this
logger.info("Reading in materials and cleaned datasets")
materials = get_materials(session)
materials_by_type = filter_materials(materials)
cleaned = get_cleaned()
logger.info("Getting components and properties recommendations")

File diff suppressed because it is too large Load diff

View file

@ -1,3 +1,4 @@
msgpack==1.0.5
anyio==3.7.1
cffi==1.15.1
click==8.1.3

View file

@ -7,7 +7,7 @@ from model_data.EpcClean import EpcClean
from model_data.analysis.UvalueEstimations import UvalueEstimations
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
from pathlib import Path
from model_data.utils import save_data_to_s3
from utils.s3 import save_data_to_s3
LAND_REGISTRY_PATHS = [
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",

View file

@ -1,6 +1,4 @@
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
import pandas as pd
from io import BytesIO
import re
from textblob import TextBlob
@ -48,45 +46,3 @@ def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
# Upload the Parquet file to S3
client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
def save_data_to_s3(data, bucket_name, s3_file_name):
    """
    Upload an object to an S3 bucket.

    Problems are reported with print statements rather than raised, and nothing
    is returned whether or not the upload succeeds.
    :param data: The data to upload as the body of the object
    :param bucket_name: The name of the S3 bucket to upload into
    :param s3_file_name: The key the data is stored under in S3
    """
    # AWS credentials need to be set up - either via environment variables, AWS CLI, or IAM roles
    try:
        client = boto3.client('s3')
    except (NoCredentialsError, PartialCredentialsError) as credentials_error:
        if isinstance(credentials_error, NoCredentialsError):
            print("Credentials not available.")
        else:
            print("Incomplete credentials provided.")
        return

    # Any failure during the upload is reported and swallowed
    try:
        client.put_object(Body=data, Key=s3_file_name, Bucket=bucket_name)
        print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
    except Exception as e:
        print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
def read_from_s3(bucket_name, s3_file_name):
    """
    Fetch the contents of an object in s3. The data is returned exactly as read;
    decoding it is the responsibility of the caller
    :param bucket_name: The name of the S3 bucket holding the object
    :param s3_file_name: The key of the object to read from S3
    :return: The result of reading the body of the S3 object
    """
    # Look the object up through the S3 resource interface
    s3_object = boto3.resource('s3').Object(bucket_name, s3_file_name)

    # Request the object and read its body in full
    response = s3_object.get()
    return response['Body'].read()

44
utils/s3.py Normal file
View file

@ -0,0 +1,44 @@
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
def read_from_s3(bucket_name, s3_file_name):
    """
    Fetch the contents of an object in s3. The data is returned exactly as read;
    decoding it is the responsibility of the caller
    :param bucket_name: The name of the S3 bucket holding the object
    :param s3_file_name: The key of the object to read from S3
    :return: The result of reading the body of the S3 object
    """
    # Look the object up through the S3 resource interface
    s3_object = boto3.resource('s3').Object(bucket_name, s3_file_name)

    # Request the object and read its body in full
    response = s3_object.get()
    return response['Body'].read()
def save_data_to_s3(data, bucket_name, s3_file_name):
    """
    Upload an object to an S3 bucket.

    Problems are reported with print statements rather than raised, and nothing
    is returned whether or not the upload succeeds.
    :param data: The data to upload as the body of the object
    :param bucket_name: The name of the S3 bucket to upload into
    :param s3_file_name: The key the data is stored under in S3
    """
    # AWS credentials need to be set up - either via environment variables, AWS CLI, or IAM roles
    try:
        client = boto3.client('s3')
    except (NoCredentialsError, PartialCredentialsError) as credentials_error:
        if isinstance(credentials_error, NoCredentialsError):
            print("Credentials not available.")
        else:
            print("Incomplete credentials provided.")
        return

    # Any failure during the upload is reported and swallowed
    try:
        client.put_object(Body=data, Key=s3_file_name, Bucket=bucket_name)
        print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
    except Exception as e:
        print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')