added creating of cleaning dataset and storage of cleaning data

2026-06-08 11:17:27 +00:00 · 2023-09-05 16:33:50 +01:00 · 2023-09-05 16:33:50 +01:00 · 0692140acf
commit 0692140acf
parent 637ada9cb3
3 changed files with 48 additions and 7 deletions
--- a/model_data/requirements/requirements.txt
+++ b/model_data/requirements/requirements.txt
@ -18,3 +18,5 @@ statsmodels
 scikit-learn
 pyspellchecker
 textblob
+boto3
+pyarrow
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@ -3,7 +3,7 @@ import pandas as pd
 from tqdm import tqdm

 from pathlib import Path
-from core.Settings import (
+from simulation_system.core.Settings import (
    MANDATORY_FIXED_FEATURES,
    AVERAGE_FIXED_FEATURES,
    LATEST_FIELD,
@ -12,9 +12,10 @@ from core.Settings import (
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
 )
-from core.DataProcessor import DataProcessor
+from simulation_system.core.DataProcessor import DataProcessor
+from utils import save_dataframe_to_s3_parquet

-DATA_DIRECTORY = Path(__file__).parent / "data" / "all-domestic-certificates"
+DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates"


 # TODO: Have a look at temporal features
@ -28,8 +29,12 @@ def app():

    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
+    # temp
+    needed = {'E09000030', 'E09000028', 'E09000013', 'E09000012', 'E09000014'}
+    directories = [d for d in directories if any(n in d.name for n in needed)]

    dataset = []
+    cleaning_dataset = []
    # 116
    # 128048706
    # PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic
@ -107,7 +112,7 @@ def app():
            variable_data = modified_property_data[
                COMPONENT_FEATURES
                + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
-            ]
+                ]

            # Note: we look at changes between subsequent EPCS, however we could look at other permutations
            # e.g. first vs second, second vs third and also first vs third
@ -133,10 +138,10 @@ def app():

                starting_record = starting_record[
                    COMPONENT_FEATURES + ["LODGEMENT_DATE"]
-                ].add_suffix("_STARTING")
+                    ].add_suffix("_STARTING")
                ending_record = ending_record[
                    COMPONENT_FEATURES + ["LODGEMENT_DATE"]
-                ].add_suffix("_ENDING")
+                    ].add_suffix("_ENDING")

                features = pd.concat([starting_record, ending_record])

@ -150,7 +155,18 @@ def app():
                    }
                )

-            dataset.extend(property_model_data)
+            dataset.append(property_model_data)
+
+        cleaning_averages["local-authority"] = df["LOCAL_AUTHORITY"].values[0]
+        cleaning_dataset.append(cleaning_averages)
+
+    # Store cleaning dataset in s3 as a parquet file
+    cleaning_dataset = pd.concat(cleaning_dataset)
+    save_dataframe_to_s3_parquet(
+        df=cleaning_dataset,
+        bucket_name="retrofit-data-dev",
+        file_key="sap_change_model/cleaning_dataset.parquet",
+    )

    output = pd.DataFrame(dataset)
    output.to_parquet("./dataset.parquet")
--- a/model_data/utils.py
+++ b/model_data/utils.py
@ -1,3 +1,6 @@
+import boto3
+import pandas as pd
+from io import BytesIO
 import re
 from textblob import TextBlob

@ -24,3 +27,23 @@ def correct_spelling(text):

    corrected_text = ' '.join(corrected_words)
    return corrected_text
+
+
+def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
+    """
+    Save a pandas DataFrame to S3 as a Parquet file.
+
+    :param df: The pandas DataFrame.
+    :param bucket_name: Name of the S3 bucket.
+    :param file_key: Key of the file (including directory path within the bucket).
+    """
+
+    # Convert the DataFrame to a Parquet format in memory
+    parquet_buffer = BytesIO()
+    df.to_parquet(parquet_buffer)
+
+    # Create the boto3 client
+    client = boto3.client('s3')
+
+    # Upload the Parquet file to S3
+    client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())