rebuilding epc clean

2026-07-27 23:35:01 +00:00 · 2025-08-09 23:10:59 +01:00 · 2025-08-09 23:10:59 +01:00 · 33ca9b7988
commit 33ca9b7988
parent 3bdadc80df
6 changed files with 41 additions and 99 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="epc_clean" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="epc_clean" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
--- a/backend/tests/test_funding.py
+++ b/backend/tests/test_funding.py
@@ -3,6 +3,8 @@ import pandas as pd
 from backend.Funding import Funding, EligibilityCaveats
 from backend.tests.test_data.innovation_measure_fixtures import innovation_scenarios
 from backend.tests.test_data.pre_heating_scenarios import pre_main_heating_scenarios
+from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
+from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes


@pytest.fixture
@@ -1040,99 +1042,18 @@ def test_map_to_pre_main_heating(scenario):
        "expected"], f"Failed: {scenario['description']} -> {result} (expected {scenario['expected']})"


-# Large scale testing for various measures
-measures = [
-    {"type": "solar_pv", "is_innovation": True, "uplift": 0.45},
-    {"type": "internal_wall_insulation", "is_innovation": False, "uplift": 0},
-    {"type": "cavity_wall_insulation", "is_innovation": False, "uplift": 0},
-    {"type": "external_wall_insulation", "is_innovation": False, "uplift": 0},
-    {"type": "loft_insulation", "is_innovation": False, "uplift": 0},
-    {"type": "air_source_heat_pump", "is_innovation": False, "uplift": 0},
-    {"type": "double_glazing", "is_innovation": False, "uplift": 0},
-    {"type": "cavity_wall_insulation", "is_innovation": True, "uplift": 0.25},
-    {"type": "high_heat_retention_storage_heaters", "is_innovation": False, "uplift": 0},
-]
-epc_df = pd.read_csv(
-    "/Users/khalimconn-kowlessar/Downloads/domestic-E08000003-Manchester/certificates.csv"
-)
-from tqdm import tqdm
-from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
-from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
-
 # TODO: Add innovation uplift to private
+raise ValueError("TODO: ADD INNOVATION TO PRIVATE")

-mock_project_scores_matrix = mock_project_scores_matrix()
-mock_whlg_postcodes = mock_whlg_postcodes()
-mock_partial_scores_matrix = mock_partial_scores_matrix()
-
-errors = []
-for _, x in tqdm(epc_df.iterrows(), total=len(epc_df)):
-    try:
-        # inputs
-        mainheat_energy_eff = x["MAINHEAT_ENERGY_EFF"]
-        heating_cleaner = MainHeatAttributes(description=x["MAINHEAT_DESCRIPTION"])
-        fuel_cleaner = MainFuelAttributes(description="" if pd.isnull(x["MAIN_FUEL"]) else x["MAIN_FUEL"])
-
-        h = heating_cleaner.process()
-        f = fuel_cleaner.process()
-
-        funding = Funding(
-            project_scores_matrix=mock_project_scores_matrix,
-            partial_project_scores_matrix=mock_partial_scores_matrix,
-            whlg_eligible_postcodes=mock_whlg_postcodes,
-            social_cavity_abs_rate=13.5,
-            social_solid_abs_rate=17,
-            private_cavity_abs_rate=13.5,
-            private_solid_abs_rate=17,
-            tenure="Social"
-        )
-
-        self = funding
-        measures = measures
-        starting_sap = 33
-        ending_sap = 69
-        floor_area = 71
-        mainheat_description = x["MAINHEAT_DESCRIPTION"]
-        heating_control_description = x["MAINHEATCONT_DESCRIPTION"]
-        is_cavity = True
-        current_wall_uvalue = 2
-        is_partial = False
-        existing_li_thickness = 0
-        has_wall_insulation_recommendation = True
-        has_roof_insulation_recommendation = True
-        mainheating = h
-        main_fuel = f
-        mainheat_energy_eff = mainheat_energy_eff
-
-        funding.check_funding(
-            measures=measures,
-            starting_sap=33,
-            ending_sap=69,
-            floor_area=71,
-            mainheat_description=x["MAINHEAT_DESCRIPTION"],
-            heating_control_description=x["MAINHEATCONT_DESCRIPTION"],
-            is_cavity=True,
-            current_wall_uvalue=2,
-            is_partial=False,
-            existing_li_thickness=0,
-            has_wall_insulation_recommendation=True,
-            has_roof_insulation_recommendation=True,
-            mainheating=h,
-            main_fuel=f,
-            mainheat_energy_eff=mainheat_energy_eff,
-        )
-    except Exception as e:
-        errors.append(x["LMK_KEY"])
-
-errored_epcs = epc_df[epc_df["LMK_KEY"].isin(errors)]
-unique_combs = errored_epcs[["MAINHEAT_ENERGY_EFF", "MAINHEAT_DESCRIPTION", "MAIN_FUEL"]].drop_duplicates()
-i = 2
-x = errored_epcs[
-    (errored_epcs["MAINHEAT_ENERGY_EFF"] == unique_combs["MAINHEAT_ENERGY_EFF"].values[i]) &
-    (errored_epcs["MAINHEAT_DESCRIPTION"] == unique_combs["MAINHEAT_DESCRIPTION"].values[i]) &
-    (errored_epcs["MAIN_FUEL"] == unique_combs["MAIN_FUEL"].values[i])
-    ].head(1).squeeze()
-
-most_prominent_combinations = epc_df.groupby(
-    ["MAINHEAT_ENERGY_EFF", "MAINHEAT_DESCRIPTION", "MAIN_FUEL"]
-)["LMK_KEY"].nunique().reset_index().sort_values("LMK_KEY", ascending=False).head(30).to_dict("records")
+# Large scale testing for various measures
+# measures = [
+#     {"type": "solar_pv", "is_innovation": True, "uplift": 0.45},
+#     {"type": "internal_wall_insulation", "is_innovation": False, "uplift": 0},
+#     {"type": "cavity_wall_insulation", "is_innovation": False, "uplift": 0},
+#     {"type": "external_wall_insulation", "is_innovation": False, "uplift": 0},
+#     {"type": "loft_insulation", "is_innovation": False, "uplift": 0},
+#     {"type": "air_source_heat_pump", "is_innovation": False, "uplift": 0},
+#     {"type": "double_glazing", "is_innovation": False, "uplift": 0},
+#     {"type": "cavity_wall_insulation", "is_innovation": True, "uplift": 0.25},
+#     {"type": "high_heat_retention_storage_heaters", "is_innovation": False, "uplift": 0},
+# ]
--- a/etl/epc_clean/app.py
+++ b/etl/epc_clean/app.py
@@ -3,11 +3,12 @@ import os
 import pandas as pd
 import msgpack
 import inspect
+from datetime import datetime

 from etl.epc_clean.EpcClean import EpcClean
 from etl.epc.settings import EARLIEST_EPC_DATE
 from pathlib import Path
-from utils.s3 import save_data_to_s3
+from utils.s3 import save_data_to_s3, read_from_s3

 src_file_path = inspect.getfile(lambda: None)

@@ -22,7 +23,7 @@ LAND_REGISTRY_PATHS = [
    os.path.abspath(os.path.dirname(src_file_path)) + "/model_data/local_data/pp-2017-part2.csv",
 ]

-EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates"
+EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads") / "all-domestic-certificates"

 ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")

@@ -74,6 +75,18 @@ def app():
    # data being read in will be extremely small, meaning quicker load times. We'll begin by storing as a single
    # file and monitor usage patterns to see if it makes sense to split the data up

+    # TODO: Copy the existing cleaned data to an archive location, in case we wish to roll back easily
+    cleaned_historic = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name=f"retrofit-data-{ENVIRONMENT}"
+    )
+    cleaned_historic = msgpack.unpackb(cleaned_historic, raw=False)
+    save_data_to_s3(
+        data=msgpack.packb(cleaned_historic, use_bin_type=True),
+        s3_file_name=f"cleaned_epc_data/archive/{str(datetime.now())} - cleaned.bson",
+        bucket_name=f"retrofit-data-{ENVIRONMENT}"
+    )
+
    save_data_to_s3(
        data=msgpack.packb(cleaned_data, use_bin_type=True),
        s3_file_name="cleaned_epc_data/cleaned.bson",
--- a/etl/epc_clean/epc_attributes/MainheatAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainheatAttributes.py
@@ -74,7 +74,10 @@ class MainHeatAttributes(Definitions):
        "dim system ar gael, rhagdybir bod gwresogyddion trydan, trydan": "no system present, electric heaters assumed",
        # Should be handled by edge cases
        ", trydan": ", electric",
-        'awyr gynnes, nwy prif gyflenwad': 'warm air, mains gas'
+        'awyr gynnes, nwy prif gyflenwad': 'warm air, mains gas',
+        "bwyler a rheiddiaduron, nwy prif gyflenwad, gwresogyddion ystafell, trydan": "Boiler and radiators, "
+                                                                                      "mains gas, Room heaters, "
+                                                                                      "electric"
    }

    REMAP = {
--- a/etl/epc_clean/requirements.txt
+++ b/etl/epc_clean/requirements.txt
@@ -0,0 +1,5 @@
+tqdm
+pandas
+msgpack
+textblob
+boto3