adding a dummy testing script

2026-07-27 23:35:01 +00:00 · 2023-08-13 11:43:45 +00:00 · 2023-08-13 11:43:45 +00:00 · 54964bcf17
commit 54964bcf17
parent db67e0e23f
2 changed files with 68 additions and 6 deletions
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@ -3,7 +3,7 @@ import pandas as pd
 from tqdm import tqdm
 from model_data.BaseUtility import BaseUtility
 from pathlib import Path
-from simulation_system.Settings import (
+from model_data.simulation_system.Settings import (
    MANDATORY_FIXED_FEATURES,
    AVERAGE_FIXED_FEATURES, 
    LATEST_FIELD, 
@ -28,7 +28,6 @@ def app():

    dataset = []

-
    for directory in tqdm(directories): 

        filepath = directory / "certificates.csv"
@ -43,16 +42,16 @@ def app():
            # Fixed features - these are property attributes that shouldn't change over time
            fixed_data = {}

+            # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
+            if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
+                continue
+
            # Map all anomaly values to None
            data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES)))
            
            # Use replace function to map data (if exists in key), to corresponding value - i.e. Remove invalid values
            modified_property_data = property_data.replace(data_anomaly_map)
            modified_property_data = modified_property_data.replace(np.NAN, None)
-
-            # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
-            if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
-                continue
            
            # Remap certain columns
            modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
--- a/model_data/simulation_system/energy_predictor.py
+++ b/model_data/simulation_system/energy_predictor.py
@ -0,0 +1,63 @@
+from pathlib import Path
+from Settings import (
+    RDSAP_RESPONSE, 
+    FLOOR_LEVEL_MAP, 
+    BUILT_FORM_REMAP,
+    EARLIEST_EPC_DATE, 
+    FULLY_GLAZED_DESCRIPTIONS
+    )
+from model_data.BaseUtility import BaseUtility
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+
+DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'  # main() reads <sub-directory>/certificates.csv under this folder
+
+def main():
+    """
+    Build the dataset for a simple RDSAP predictor.
+
+    Reads certificates.csv from up to 10 sub-directories of DATA_DIRECTORY, cleans
+    each file and writes the combined rows to ./energy_predictor_data.parquet.
+    """
+    # Every anomaly marker maps to None; loop-invariant, so build the mapping once
+    data_anomaly_map = dict.fromkeys(BaseUtility.DATA_ANOMALY_MATCHES)
+
+    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
+    # NOTE(review): presumably capped at 10 to keep this test script quick - confirm
+    directories = directories[0:10]
+    dfs = []
+    for directory in tqdm(directories):
+        df = pd.read_csv(directory / "certificates.csv")
+
+        # Remove any bad uprns and ignore old/bad data
+        df = df[df["UPRN"].notna()]
+        df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
+        df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
+        df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
+
+        # A missing glazing proportion is set to 100 when the windows description is fully glazed
+        fully_glazed = df["MULTI_GLAZE_PROPORTION"].isna() & df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)
+        df.loc[fully_glazed, "MULTI_GLAZE_PROPORTION"] = 100
+
+        # Recast, then order rows by property and lodgement date
+        df["UPRN"] = df["UPRN"].astype(int).astype(str)
+        df["MAIN_HEATING_CONTROLS"] = df["MAIN_HEATING_CONTROLS"].astype(float)
+        df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
+
+        # Remove invalid values: anomaly markers and NaN both become None
+        df = df.replace(data_anomaly_map)
+        df = df.replace(np.nan, None)  # np.nan is the supported spelling (np.NAN is gone in NumPy 2.0)
+
+        # Remap certain columns in place
+        df["FLOOR_LEVEL"] = df["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
+        df["BUILT_FORM"] = df["BUILT_FORM"].replace(BUILT_FORM_REMAP)
+
+        dfs.append(df)
+
+    data = pd.concat(dfs)
+    data.to_parquet('./energy_predictor_data.parquet')
+
+
+if __name__ == "__main__":  # Build the dataset only when this file is run as a script
+    main()