mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added cleaned data storage
This commit is contained in:
parent
4f02b86efb
commit
2e58937337
2 changed files with 38 additions and 4 deletions
|
|
@ -1,14 +1,13 @@
|
|||
from tqdm import tqdm
|
||||
import os
|
||||
import pandas as pd
|
||||
import json
|
||||
|
||||
from model_data.config import EPC_AUTH_TOKEN
|
||||
from epc_api.client import EpcClient
|
||||
from model_data.downloader import pagenated_epc_download
|
||||
from model_data.EpcClean import EpcClean
|
||||
from model_data.analysis.UvalueEstimations import UvalueEstimations
|
||||
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
|
||||
from pathlib import Path
|
||||
from model_data.utils import save_json_to_s3
|
||||
|
||||
LAND_REGISTRY_PATHS = [
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",
|
||||
|
|
@ -23,13 +22,16 @@ LAND_REGISTRY_PATHS = [
|
|||
|
||||
EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
||||
|
||||
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
|
||||
|
||||
|
||||
def app():
|
||||
"""
|
||||
For a pre-defined list of constituencies and property data_types, we'll download EPC data from the API
|
||||
and produce a dataset of cleaned fields so that when we get new properties, we can quickly
|
||||
sanitise any description data
|
||||
:return:
|
||||
|
||||
Currently, this application is just run on a local machine
|
||||
"""
|
||||
|
||||
cleaned_data = {}
|
||||
|
|
@ -68,3 +70,9 @@ def app():
|
|||
# uvalue_estimates.walls
|
||||
# uvalue_estimates.floors
|
||||
# uvalue_estimates.roofs
|
||||
|
||||
save_json_to_s3(
|
||||
json_data=json.dumps(cleaned_data),
|
||||
s3_file_name="cleaned_epc_data/cleaned.json",
|
||||
bucket_name=f"retrofit-data-{ENVIRONMENT}"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import boto3
|
||||
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
|
||||
import pandas as pd
|
||||
from io import BytesIO
|
||||
import re
|
||||
|
|
@ -47,3 +48,28 @@ def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
|
|||
|
||||
# Upload the Parquet file to S3
|
||||
client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
|
||||
|
||||
|
||||
def save_json_to_s3(json_data, bucket_name, s3_file_name):
    """
    Save a JSON object to an S3 bucket

    Failures are reported on stdout rather than raised, so this function always returns ``None``
    for credential and upload errors.

    :param json_data: The JSON data to save; passed unchanged as the ``Body`` of ``put_object``
    :param bucket_name: The name of the S3 bucket
    :param s3_file_name: The file name to use for the saved data in S3
    """
    # Ensure you have AWS credentials set up - either via environment variables, AWS CLI, or IAM roles
    try:
        s3 = boto3.client('s3')
    except NoCredentialsError:
        print("Credentials not available.")
        return
    except PartialCredentialsError:
        print("Incomplete credentials provided.")
        return

    # botocore resolves missing credentials lazily: a client can be created without any
    # credentials and NoCredentialsError only surfaces when the request is signed. The
    # credential-specific handlers are therefore needed here too, otherwise that case is
    # only ever reported through the generic failure message below.
    try:
        s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=json_data)
    except NoCredentialsError:
        print("Credentials not available.")
    except PartialCredentialsError:
        print("Incomplete credentials provided.")
    except Exception as e:
        # Deliberate best-effort: report the failure and carry on rather than crash the caller
        print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
    else:
        print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue