mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
added creating of cleaning dataset and storage of cleaning data
This commit is contained in:
parent
637ada9cb3
commit
0692140acf
3 changed files with 48 additions and 7 deletions
|
|
@ -18,3 +18,5 @@ statsmodels
|
|||
scikit-learn
|
||||
pyspellchecker
|
||||
textblob
|
||||
boto3
|
||||
pyarrow
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import pandas as pd
|
|||
from tqdm import tqdm
|
||||
|
||||
from pathlib import Path
|
||||
from core.Settings import (
|
||||
from simulation_system.core.Settings import (
|
||||
MANDATORY_FIXED_FEATURES,
|
||||
AVERAGE_FIXED_FEATURES,
|
||||
LATEST_FIELD,
|
||||
|
|
@ -12,9 +12,10 @@ from core.Settings import (
|
|||
HEAT_DEMAND_RESPONSE,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
)
|
||||
from core.DataProcessor import DataProcessor
|
||||
from simulation_system.core.DataProcessor import DataProcessor
|
||||
from utils import save_dataframe_to_s3_parquet
|
||||
|
||||
DATA_DIRECTORY = Path(__file__).parent / "data" / "all-domestic-certificates"
|
||||
DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates"
|
||||
|
||||
|
||||
# TODO: Have a look at temporal features
|
||||
|
|
@ -28,8 +29,12 @@ def app():
|
|||
|
||||
# List all subdirectories
|
||||
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
# temp
|
||||
needed = {'E09000030', 'E09000028', 'E09000013', 'E09000012', 'E09000014'}
|
||||
directories = [d for d in directories if any(n in d.name for n in needed)]
|
||||
|
||||
dataset = []
|
||||
cleaning_dataset = []
|
||||
# 116
|
||||
# 128048706
|
||||
# PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic
|
||||
|
|
@ -107,7 +112,7 @@ def app():
|
|||
variable_data = modified_property_data[
|
||||
COMPONENT_FEATURES
|
||||
+ ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
|
||||
]
|
||||
]
|
||||
|
||||
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
|
||||
# e.g. first vs second, second vs third and also first vs third
|
||||
|
|
@ -133,10 +138,10 @@ def app():
|
|||
|
||||
starting_record = starting_record[
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE"]
|
||||
].add_suffix("_STARTING")
|
||||
].add_suffix("_STARTING")
|
||||
ending_record = ending_record[
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE"]
|
||||
].add_suffix("_ENDING")
|
||||
].add_suffix("_ENDING")
|
||||
|
||||
features = pd.concat([starting_record, ending_record])
|
||||
|
||||
|
|
@ -150,7 +155,18 @@ def app():
|
|||
}
|
||||
)
|
||||
|
||||
dataset.extend(property_model_data)
|
||||
dataset.append(property_model_data)
|
||||
|
||||
cleaning_averages["local-authority"] = df["LOCAL_AUTHORITY"].values[0]
|
||||
cleaning_dataset.append(cleaning_averages)
|
||||
|
||||
# Store cleaning dataset in s3 as a parquet file
|
||||
cleaning_dataset = pd.concat(cleaning_dataset)
|
||||
save_dataframe_to_s3_parquet(
|
||||
df=cleaning_dataset,
|
||||
bucket_name="retrofit-data-dev",
|
||||
file_key="sap_change_model/cleaning_dataset.parquet",
|
||||
)
|
||||
|
||||
output = pd.DataFrame(dataset)
|
||||
output.to_parquet("./dataset.parquet")
|
||||
|
|
|
|||
|
|
@ -1,3 +1,6 @@
|
|||
import boto3
|
||||
import pandas as pd
|
||||
from io import BytesIO
|
||||
import re
|
||||
from textblob import TextBlob
|
||||
|
||||
|
|
@ -24,3 +27,23 @@ def correct_spelling(text):
|
|||
|
||||
corrected_text = ' '.join(corrected_words)
|
||||
return corrected_text
|
||||
|
||||
|
||||
def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
|
||||
"""
|
||||
Save a pandas DataFrame to S3 as a Parquet file.
|
||||
|
||||
:param df: The pandas DataFrame.
|
||||
:param bucket_name: Name of the S3 bucket.
|
||||
:param file_key: Key of the file (including directory path within the bucket).
|
||||
"""
|
||||
|
||||
# Convert the DataFrame to a Parquet format in memory
|
||||
parquet_buffer = BytesIO()
|
||||
df.to_parquet(parquet_buffer)
|
||||
|
||||
# Create the boto3 client
|
||||
client = boto3.client('s3')
|
||||
|
||||
# Upload the Parquet file to S3
|
||||
client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue