rebuilding epc clean

This commit is contained in:
Khalim Conn-Kowlessar 2025-08-09 23:10:59 +01:00
parent 3bdadc80df
commit 33ca9b7988
6 changed files with 41 additions and 99 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="epc_clean" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="epc_clean" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>

View file

@ -3,6 +3,8 @@ import pandas as pd
from backend.Funding import Funding, EligibilityCaveats
from backend.tests.test_data.innovation_measure_fixtures import innovation_scenarios
from backend.tests.test_data.pre_heating_scenarios import pre_main_heating_scenarios
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
@pytest.fixture
@ -1040,99 +1042,18 @@ def test_map_to_pre_main_heating(scenario):
"expected"], f"Failed: {scenario['description']} -> {result} (expected {scenario['expected']})"
# Large scale testing for various measures
measures = [
{"type": "solar_pv", "is_innovation": True, "uplift": 0.45},
{"type": "internal_wall_insulation", "is_innovation": False, "uplift": 0},
{"type": "cavity_wall_insulation", "is_innovation": False, "uplift": 0},
{"type": "external_wall_insulation", "is_innovation": False, "uplift": 0},
{"type": "loft_insulation", "is_innovation": False, "uplift": 0},
{"type": "air_source_heat_pump", "is_innovation": False, "uplift": 0},
{"type": "double_glazing", "is_innovation": False, "uplift": 0},
{"type": "cavity_wall_insulation", "is_innovation": True, "uplift": 0.25},
{"type": "high_heat_retention_storage_heaters", "is_innovation": False, "uplift": 0},
]
epc_df = pd.read_csv(
"/Users/khalimconn-kowlessar/Downloads/domestic-E08000003-Manchester/certificates.csv"
)
from tqdm import tqdm
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
# TODO: Add innovation uplift to private
raise ValueError("TODO: ADD INNOVATION TO PRIVATE")
mock_project_scores_matrix = mock_project_scores_matrix()
mock_whlg_postcodes = mock_whlg_postcodes()
mock_partial_scores_matrix = mock_partial_scores_matrix()
errors = []
for _, x in tqdm(epc_df.iterrows(), total=len(epc_df)):
try:
# inputs
mainheat_energy_eff = x["MAINHEAT_ENERGY_EFF"]
heating_cleaner = MainHeatAttributes(description=x["MAINHEAT_DESCRIPTION"])
fuel_cleaner = MainFuelAttributes(description="" if pd.isnull(x["MAIN_FUEL"]) else x["MAIN_FUEL"])
h = heating_cleaner.process()
f = fuel_cleaner.process()
funding = Funding(
project_scores_matrix=mock_project_scores_matrix,
partial_project_scores_matrix=mock_partial_scores_matrix,
whlg_eligible_postcodes=mock_whlg_postcodes,
social_cavity_abs_rate=13.5,
social_solid_abs_rate=17,
private_cavity_abs_rate=13.5,
private_solid_abs_rate=17,
tenure="Social"
)
self = funding
measures = measures
starting_sap = 33
ending_sap = 69
floor_area = 71
mainheat_description = x["MAINHEAT_DESCRIPTION"]
heating_control_description = x["MAINHEATCONT_DESCRIPTION"]
is_cavity = True
current_wall_uvalue = 2
is_partial = False
existing_li_thickness = 0
has_wall_insulation_recommendation = True
has_roof_insulation_recommendation = True
mainheating = h
main_fuel = f
mainheat_energy_eff = mainheat_energy_eff
funding.check_funding(
measures=measures,
starting_sap=33,
ending_sap=69,
floor_area=71,
mainheat_description=x["MAINHEAT_DESCRIPTION"],
heating_control_description=x["MAINHEATCONT_DESCRIPTION"],
is_cavity=True,
current_wall_uvalue=2,
is_partial=False,
existing_li_thickness=0,
has_wall_insulation_recommendation=True,
has_roof_insulation_recommendation=True,
mainheating=h,
main_fuel=f,
mainheat_energy_eff=mainheat_energy_eff,
)
except Exception as e:
errors.append(x["LMK_KEY"])
errored_epcs = epc_df[epc_df["LMK_KEY"].isin(errors)]
unique_combs = errored_epcs[["MAINHEAT_ENERGY_EFF", "MAINHEAT_DESCRIPTION", "MAIN_FUEL"]].drop_duplicates()
i = 2
x = errored_epcs[
(errored_epcs["MAINHEAT_ENERGY_EFF"] == unique_combs["MAINHEAT_ENERGY_EFF"].values[i]) &
(errored_epcs["MAINHEAT_DESCRIPTION"] == unique_combs["MAINHEAT_DESCRIPTION"].values[i]) &
(errored_epcs["MAIN_FUEL"] == unique_combs["MAIN_FUEL"].values[i])
].head(1).squeeze()
most_prominent_combinations = epc_df.groupby(
["MAINHEAT_ENERGY_EFF", "MAINHEAT_DESCRIPTION", "MAIN_FUEL"]
)["LMK_KEY"].nunique().reset_index().sort_values("LMK_KEY", ascending=False).head(30).to_dict("records")
# Large scale testing for various measures
# measures = [
# {"type": "solar_pv", "is_innovation": True, "uplift": 0.45},
# {"type": "internal_wall_insulation", "is_innovation": False, "uplift": 0},
# {"type": "cavity_wall_insulation", "is_innovation": False, "uplift": 0},
# {"type": "external_wall_insulation", "is_innovation": False, "uplift": 0},
# {"type": "loft_insulation", "is_innovation": False, "uplift": 0},
# {"type": "air_source_heat_pump", "is_innovation": False, "uplift": 0},
# {"type": "double_glazing", "is_innovation": False, "uplift": 0},
# {"type": "cavity_wall_insulation", "is_innovation": True, "uplift": 0.25},
# {"type": "high_heat_retention_storage_heaters", "is_innovation": False, "uplift": 0},
# ]

View file

@ -3,11 +3,12 @@ import os
import pandas as pd
import msgpack
import inspect
from datetime import datetime
from etl.epc_clean.EpcClean import EpcClean
from etl.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
from utils.s3 import save_data_to_s3
from utils.s3 import save_data_to_s3, read_from_s3
src_file_path = inspect.getfile(lambda: None)
@ -22,7 +23,7 @@ LAND_REGISTRY_PATHS = [
os.path.abspath(os.path.dirname(src_file_path)) + "/model_data/local_data/pp-2017-part2.csv",
]
EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates"
EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads") / "all-domestic-certificates"
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
@ -74,6 +75,18 @@ def app():
# data being read in will be extremely small, meaning quicker load times. We'll begin by storing as a single
# file and monitor usage patterns to see if it makes sense to split the data up
# TODO: Copy the existing cleaned data to an archive location, in case we wish to roll back easily
cleaned_historic = read_from_s3(
s3_file_name="cleaned_epc_data/cleaned.bson",
bucket_name=f"retrofit-data-{ENVIRONMENT}"
)
cleaned_historic = msgpack.unpackb(cleaned_historic, raw=False)
save_data_to_s3(
data=msgpack.packb(cleaned_historic, use_bin_type=True),
s3_file_name=f"cleaned_epc_data/archive/{str(datetime.now())} - cleaned.bson",
bucket_name=f"retrofit-data-{ENVIRONMENT}"
)
save_data_to_s3(
data=msgpack.packb(cleaned_data, use_bin_type=True),
s3_file_name="cleaned_epc_data/cleaned.bson",

View file

@ -74,7 +74,10 @@ class MainHeatAttributes(Definitions):
"dim system ar gael, rhagdybir bod gwresogyddion trydan, trydan": "no system present, electric heaters assumed",
# Should be handled by edge cases
", trydan": ", electric",
'awyr gynnes, nwy prif gyflenwad': 'warm air, mains gas'
'awyr gynnes, nwy prif gyflenwad': 'warm air, mains gas',
"bwyler a rheiddiaduron, nwy prif gyflenwad, gwresogyddion ystafell, trydan": "Boiler and radiators, "
"mains gas, Room heaters, "
"electric"
}
REMAP = {

View file

@ -0,0 +1,5 @@
tqdm
pandas
msgpack
textblob
boto3