added creating of cleaning dataset and storage of cleaning data

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-05 16:33:50 +01:00
parent 637ada9cb3
commit 0692140acf
3 changed files with 48 additions and 7 deletions

View file

@ -18,3 +18,5 @@ statsmodels
scikit-learn
pyspellchecker
textblob
boto3
pyarrow

View file

@ -3,7 +3,7 @@ import pandas as pd
from tqdm import tqdm
from pathlib import Path
from core.Settings import (
from simulation_system.core.Settings import (
MANDATORY_FIXED_FEATURES,
AVERAGE_FIXED_FEATURES,
LATEST_FIELD,
@ -12,9 +12,10 @@ from core.Settings import (
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
)
from core.DataProcessor import DataProcessor
from simulation_system.core.DataProcessor import DataProcessor
from utils import save_dataframe_to_s3_parquet
DATA_DIRECTORY = Path(__file__).parent / "data" / "all-domestic-certificates"
DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates"
# TODO: Have a look at temporal features
@ -28,8 +29,12 @@ def app():
# List all subdirectories
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
# temp
needed = {'E09000030', 'E09000028', 'E09000013', 'E09000012', 'E09000014'}
directories = [d for d in directories if any(n in d.name for n in needed)]
dataset = []
cleaning_dataset = []
# 116
# 128048706
# PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic
@ -107,7 +112,7 @@ def app():
variable_data = modified_property_data[
COMPONENT_FEATURES
+ ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
]
]
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
# e.g. first vs second, second vs third and also first vs third
@ -133,10 +138,10 @@ def app():
starting_record = starting_record[
COMPONENT_FEATURES + ["LODGEMENT_DATE"]
].add_suffix("_STARTING")
].add_suffix("_STARTING")
ending_record = ending_record[
COMPONENT_FEATURES + ["LODGEMENT_DATE"]
].add_suffix("_ENDING")
].add_suffix("_ENDING")
features = pd.concat([starting_record, ending_record])
@ -150,7 +155,18 @@ def app():
}
)
dataset.extend(property_model_data)
dataset.append(property_model_data)
cleaning_averages["local-authority"] = df["LOCAL_AUTHORITY"].values[0]
cleaning_dataset.append(cleaning_averages)
# Store cleaning dataset in s3 as a parquet file
cleaning_dataset = pd.concat(cleaning_dataset)
save_dataframe_to_s3_parquet(
df=cleaning_dataset,
bucket_name="retrofit-data-dev",
file_key="sap_change_model/cleaning_dataset.parquet",
)
output = pd.DataFrame(dataset)
output.to_parquet("./dataset.parquet")

View file

@ -1,3 +1,6 @@
import boto3
import pandas as pd
from io import BytesIO
import re
from textblob import TextBlob
@ -24,3 +27,23 @@ def correct_spelling(text):
corrected_text = ' '.join(corrected_words)
return corrected_text
def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
"""
Save a pandas DataFrame to S3 as a Parquet file.
:param df: The pandas DataFrame.
:param bucket_name: Name of the S3 bucket.
:param file_key: Key of the file (including directory path within the bucket).
"""
# Convert the DataFrame to a Parquet format in memory
parquet_buffer = BytesIO()
df.to_parquet(parquet_buffer)
# Create the boto3 client
client = boto3.client('s3')
# Upload the Parquet file to S3
client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())