diff --git a/model_data/requirements/requirements.txt b/model_data/requirements/requirements.txt index 28fce331..d4de6b71 100644 --- a/model_data/requirements/requirements.txt +++ b/model_data/requirements/requirements.txt @@ -18,3 +18,5 @@ statsmodels scikit-learn pyspellchecker textblob +boto3 +pyarrow diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py index c1ca56a6..8962e252 100644 --- a/model_data/simulation_system/generate_rdsap_change.py +++ b/model_data/simulation_system/generate_rdsap_change.py @@ -3,7 +3,7 @@ import pandas as pd from tqdm import tqdm from pathlib import Path -from core.Settings import ( +from simulation_system.core.Settings import ( MANDATORY_FIXED_FEATURES, AVERAGE_FIXED_FEATURES, LATEST_FIELD, @@ -12,9 +12,10 @@ from core.Settings import ( HEAT_DEMAND_RESPONSE, COLUMNS_TO_MERGE_ON, ) -from core.DataProcessor import DataProcessor +from simulation_system.core.DataProcessor import DataProcessor +from utils import save_dataframe_to_s3_parquet -DATA_DIRECTORY = Path(__file__).parent / "data" / "all-domestic-certificates" +DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates" # TODO: Have a look at temporal features @@ -28,8 +29,12 @@ def app(): # List all subdirectories directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] + # temp + needed = {'E09000030', 'E09000028', 'E09000013', 'E09000012', 'E09000014'} + directories = [d for d in directories if any(n in d.name for n in needed)] dataset = [] + cleaning_dataset = [] # 116 # 128048706 # PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic @@ -107,7 +112,7 @@ def app(): variable_data = modified_property_data[ COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE] - ] + ] # Note: we look at changes between subsequent EPCS, however we could look at other permutations # e.g. first vs second, second vs third and also first vs third @@ -133,10 +138,10 @@ def app(): starting_record = starting_record[ COMPONENT_FEATURES + ["LODGEMENT_DATE"] - ].add_suffix("_STARTING") + ].add_suffix("_STARTING") ending_record = ending_record[ COMPONENT_FEATURES + ["LODGEMENT_DATE"] - ].add_suffix("_ENDING") + ].add_suffix("_ENDING") features = pd.concat([starting_record, ending_record]) @@ -150,7 +155,18 @@ def app(): } ) - dataset.extend(property_model_data) + dataset.append(property_model_data) + + cleaning_averages["local-authority"] = df["LOCAL_AUTHORITY"].values[0] + cleaning_dataset.append(cleaning_averages) + + # Store cleaning dataset in s3 as a parquet file + cleaning_dataset = pd.concat(cleaning_dataset) + save_dataframe_to_s3_parquet( + df=cleaning_dataset, + bucket_name="retrofit-data-dev", + file_key="sap_change_model/cleaning_dataset.parquet", + ) output = pd.DataFrame(dataset) output.to_parquet("./dataset.parquet") diff --git a/model_data/utils.py b/model_data/utils.py index 744914a4..07642973 100644 --- a/model_data/utils.py +++ b/model_data/utils.py @@ -1,3 +1,6 @@ +import boto3 +import pandas as pd +from io import BytesIO import re from textblob import TextBlob @@ -24,3 +27,23 @@ def correct_spelling(text): corrected_text = ' '.join(corrected_words) return corrected_text + + +def save_dataframe_to_s3_parquet(df, bucket_name, file_key): + """ + Save a pandas DataFrame to S3 as a Parquet file. + + :param df: The pandas DataFrame. + :param bucket_name: Name of the S3 bucket. + :param file_key: Key of the file (including directory path within the bucket). + """ + + # Convert the DataFrame to a Parquet format in memory + parquet_buffer = BytesIO() + df.to_parquet(parquet_buffer) + + # Create the boto3 client + client = boto3.client('s3') + + # Upload the Parquet file to S3 + client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())