From 395ab0e083899403dad4978343936a4078315905 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 13 Dec 2025 22:13:09 +0800 Subject: [PATCH] minor debugging --- asset_list/app.py | 87 +++++--- asset_list/mappings/built_form.py | 8 +- .../db/functions/recommendations_functions.py | 157 +++++++++++++-- backend/engine/engine.py | 4 +- backend/ml_models/AnnualBillSavings.py | 8 + backend/tests/test_integration.py | 188 ++++++++++-------- etl/customers/lincs_rural/get_missed.py | 47 +++++ .../{data_prep.py => a_data_prep.py} | 0 .../b_data_cleanse.py | 147 ++++++++++++++ .../c_finalised_modelling_data.py | 114 +++++++++++ .../data_cleanse.py | 6 - etl/epc/Dataset.py | 2 +- etl/webscrape/Zoopla.py | 152 +++++++------- sfr/principal_pitch/2_export_data.py | 19 +- 14 files changed, 722 insertions(+), 217 deletions(-) create mode 100644 etl/customers/lincs_rural/get_missed.py rename etl/customers/peabody/Nov 2025 Consulting Project/{data_prep.py => a_data_prep.py} (100%) create mode 100644 etl/customers/peabody/Nov 2025 Consulting Project/b_data_cleanse.py create mode 100644 etl/customers/peabody/Nov 2025 Consulting Project/c_finalised_modelling_data.py delete mode 100644 etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py diff --git a/asset_list/app.py b/asset_list/app.py index dfd7aa46..3d8a0fae 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -59,25 +59,26 @@ def app(): Property UPRN """ - # Lambeth: - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lambeth/December 10th" - data_filename = "lambeth_sw2_leigham court estate.xlsx" + # Peabody data for cleaning + data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " + "Project/data_validation") + data_filename = "to_standardise_uprns.xlsx" sheet_name = "Sheet1" postcode_column = 'Postcode' - address1_column = "Address" + address1_column = "Address 1" address1_method = None fulladdress_column = None - address_cols_to_concat = ["Address"] + address_cols_to_concat = ["Address 1", "Address 2", "Address 3"] missing_postcodes_method = None landlord_year_built = None landlord_os_uprn = None - landlord_property_type = None - landlord_built_form = None + landlord_property_type = "Type" + landlord_built_form = "Attachment" landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "row_id" + landlord_property_id = "Org Ref" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -93,6 +94,40 @@ def app(): asset_list_header = 0 landlord_block_reference = None + # Lambeth: + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lambeth/December 10th" + # data_filename = "lambeth_sw2_leigham court estate.xlsx" + # sheet_name = "Sheet1" + # postcode_column = 'Postcode' + # address1_column = "Address" + # address1_method = None + # fulladdress_column = None + # address_cols_to_concat = ["Address"] + # missing_postcodes_method = None + # landlord_year_built = None + # landlord_os_uprn = None + # landlord_property_type = None + # landlord_built_form = None + # landlord_wall_construction = None + # landlord_roof_construction = None + # landlord_heating_system = None + # landlord_existing_pv = None + # landlord_property_id = "row_id" + # landlord_sap = None + # outcomes_filename = None + # outcomes_sheetname = None + # outcomes_postcode = None + # outcomes_houseno = None + # outcomes_id = None + # outcomes_address = None + # master_filepaths = [] + # master_id_colnames = [] + # master_to_asset_list_filepath = None + # phase = False + # ecosurv_landlords = None + # asset_list_header = 0 + # landlord_block_reference = None + # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -230,22 +265,22 @@ def app(): ) # We now retrieve any failed properties - chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] - epc_data_failed, _, _ = get_data( - df=chunk_failed, - row_id_name=asset_list.DOMNA_PROPERTY_ID, - uprn_column=AssetList.STANDARD_UPRN, - fulladdress_column=AssetList.STANDARD_FULL_ADDRESS, - address1_column=AssetList.STANDARD_ADDRESS_1, - postcode_column=AssetList.STANDARD_POSTCODE, - property_type_column=AssetList.STANDARD_PROPERTY_TYPE, - built_form_column=AssetList.STANDARD_BUILT_FORM, - manual_uprn_map=manual_uprn_map, - epc_api_only=epc_api_only, - epc_auth_token=EPC_AUTH_TOKEN - ) - - epc_data_chunk.extend(epc_data_failed) + # chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] + # epc_data_failed, _, _ = get_data( + # df=chunk_failed, + # row_id_name=asset_list.DOMNA_PROPERTY_ID, + # uprn_column=AssetList.STANDARD_UPRN, + # fulladdress_column=AssetList.STANDARD_FULL_ADDRESS, + # address1_column=AssetList.STANDARD_ADDRESS_1, + # postcode_column=AssetList.STANDARD_POSTCODE, + # property_type_column=AssetList.STANDARD_PROPERTY_TYPE, + # built_form_column=AssetList.STANDARD_BUILT_FORM, + # manual_uprn_map=manual_uprn_map, + # epc_api_only=epc_api_only, + # epc_auth_token=EPC_AUTH_TOKEN + # ) + # + # epc_data_chunk.extend(epc_data_failed) # Append the failed data to the main data # Store the chunk locally as a csv @@ -422,3 +457,7 @@ def app(): if not asset_list.geographical_areas.empty: asset_list.geographical_areas.to_excel(writer, sheet_name="Geographical Areas", index=False) + + # Store dupes + if not asset_list.duplicated_addresses.empty: + asset_list.duplicated_addresses.to_excel(writer, sheet_name="Duplicate Properties", index=False) diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py index 2556d755..58686d6b 100644 --- a/asset_list/mappings/built_form.py +++ b/asset_list/mappings/built_form.py @@ -458,6 +458,12 @@ BUILT_FORM_MAPPINGS = { 'Maisonette: Detached: Mid Floor': 'detached', 'Bungalow: EnclosedMidTerrace': 'enclosed mid-terrace', - 'House: EnclosedMidTerrace': 'enclosed mid-terrace' + 'House: EnclosedMidTerrace': 'enclosed mid-terrace', + + 'EnclosedMidTerrace': 'enclosed mid-terrace', + 'EnclosedEndTerrace': 'enclosed end-terrace', + 'EndTerrace': 'end-terrace', + 'SemiDetached': 'semi-detached', + 'MidTerrace': 'mid-terrace' } diff --git a/backend/app/db/functions/recommendations_functions.py b/backend/app/db/functions/recommendations_functions.py index 57ccf65c..14596749 100644 --- a/backend/app/db/functions/recommendations_functions.py +++ b/backend/app/db/functions/recommendations_functions.py @@ -1,4 +1,4 @@ -from sqlalchemy import insert, delete, text +from sqlalchemy import insert, delete, select from sqlalchemy.orm import Session from sqlalchemy.exc import SQLAlchemyError from backend.app.db.models.recommendations import ( @@ -242,20 +242,26 @@ def chunked(iterable, size=100): yield iterable[i:i + size] +# def fast_delete_recommendations(session, chunk): +# placeholders = ",".join(["(:p{})".format(i) for i in range(len(chunk))]) +# params = {f"p{i}": chunk[i] for i in range(len(chunk))} +# +# sql = text(f""" +# WITH ids(property_id) AS ( +# VALUES {placeholders} +# ) +# DELETE FROM recommendation r +# USING ids +# WHERE r.property_id = ids.property_id; +# """) +# +# session.execute(sql, params, execution_options={"synchronize_session": False}) + def fast_delete_recommendations(session, chunk): - placeholders = ",".join(["(:p{})".format(i) for i in range(len(chunk))]) - params = {f"p{i}": chunk[i] for i in range(len(chunk))} - - sql = text(f""" - WITH ids(property_id) AS ( - VALUES {placeholders} - ) - DELETE FROM recommendation r - USING ids - WHERE r.property_id = ids.property_id; - """) - - session.execute(sql, params, execution_options={"synchronize_session": False}) + session.execute( + delete(Recommendation) + .where(Recommendation.property_id.in_(chunk)) + ) def clear_portfolio(session: Session, portfolio_id: int, batch_size=100): @@ -362,11 +368,19 @@ def clear_portfolio(session: Session, portfolio_id: int, batch_size=100): # -------------------------- # Recommendations (fast delete) # -------------------------- - rec_chunks = list(chunked(property_ids, batch_size)) + # rec_chunks = list(chunked(property_ids, batch_size * 5)) # larger chunks for fast delete + # total = len(rec_chunks) + # for i, chunk in enumerate(rec_chunks, start=1): + # print_progress("Deleting Recommendations", i, total) + # fast_delete_recommendations(session, chunk) + rec_chunks = list(chunked(recommendation_ids, batch_size)) total = len(rec_chunks) for i, chunk in enumerate(rec_chunks, start=1): print_progress("Deleting Recommendations", i, total) - fast_delete_recommendations(session, chunk) + session.execute( + delete(Recommendation) + .where(Recommendation.id.in_(chunk)) + ) # -------------------------- # Inspections @@ -412,3 +426,114 @@ def clear_portfolio(session: Session, portfolio_id: int, batch_size=100): session.commit() print("Portfolio cleared.") + + +def clear_portfolio_in_batches( + session: Session, + portfolio_id: int, + property_batch_size: int = 10 +): + # Fetch all property IDs once + property_ids = [ + pid for (pid,) in + session.query(PropertyModel.id) + .filter(PropertyModel.portfolio_id == portfolio_id) + .all() + ] + + def delete_for_property_batch(prop_ids): + # ---------------------------- + # Recommendations → PlanRecommendations + # ---------------------------- + rec_subq = ( + select(Recommendation.id) + .where(Recommendation.property_id.in_(prop_ids)) + ) + + session.execute( + delete(PlanRecommendations) + .where(PlanRecommendations.recommendation_id.in_(rec_subq)) + ) + + session.execute( + delete(RecommendationMaterials) + .where(RecommendationMaterials.recommendation_id.in_(rec_subq)) + ) + + session.execute( + delete(Recommendation) + .where(Recommendation.property_id.in_(prop_ids)) + ) + + # ---------------------------- + # Inspections + # ---------------------------- + session.execute( + delete(InspectionModel) + .where(InspectionModel.property_id.in_(prop_ids)) + ) + + # ---------------------------- + # Plans (scoped to these properties) + # ---------------------------- + plan_subq = ( + select(Plan.id) + .where(Plan.property_id.in_(prop_ids)) + ) + + session.execute( + delete(PlanRecommendations) + .where(PlanRecommendations.plan_id.in_(plan_subq)) + ) + + session.execute( + delete(FundingPackageMeasures) + .where( + FundingPackageMeasures.funding_package_id.in_( + select(FundingPackage.id) + .where(FundingPackage.plan_id.in_(plan_subq)) + ) + ) + ) + + session.execute( + delete(FundingPackage) + .where(FundingPackage.plan_id.in_(plan_subq)) + ) + + session.execute( + delete(Plan) + .where(Plan.id.in_(plan_subq)) + ) + + # ---------------------------- + # Property-scoped auxiliary tables + # ---------------------------- + session.execute( + delete(PropertyDetailsEpcModel) + .where(PropertyDetailsEpcModel.property_id.in_(prop_ids)) + ) + + session.execute( + delete(PropertyTargetsModel) + .where(PropertyTargetsModel.property_id.in_(prop_ids)) + ) + + # ---------------------------- + # Properties (last) + # ---------------------------- + session.execute( + delete(PropertyModel) + .where(PropertyModel.id.in_(prop_ids)) + ) + + # -------- BATCH DELETE LOOP -------- + property_chunks = list(chunked(property_ids, property_batch_size)) + total_batches = len(property_chunks) + + for i, prop_ids in enumerate(property_chunks, start=1): + print(f"Deleting batch {i}/{total_batches} ({len(prop_ids)} properties)") + delete_for_property_batch(prop_ids) + session.commit() + + print("Portfolio cleared in batches.") diff --git a/backend/engine/engine.py b/backend/engine/engine.py index 1f2b3976..967d6b16 100644 --- a/backend/engine/engine.py +++ b/backend/engine/engine.py @@ -662,7 +662,9 @@ async def model_engine(body: PlanTriggerRequest): address1 = config.get("domna_address_1", None) address1 = str(int(address1)) if isinstance(address1, float) else str(address1) - full_address = config.get("domna_full_address") if body.file_format == "domna_asset_list" else None + full_address = config.get("domna_full_address", "") if body.file_format == "domna_asset_list" else None + if not isinstance(full_address, str): # Catch for when the full address is nan + full_address = None heating_system = parse_heating_system(config) associated_uprns = [] diff --git a/backend/ml_models/AnnualBillSavings.py b/backend/ml_models/AnnualBillSavings.py index 27d1b5be..f04ee2f1 100644 --- a/backend/ml_models/AnnualBillSavings.py +++ b/backend/ml_models/AnnualBillSavings.py @@ -290,6 +290,14 @@ class AnnualBillSavings: # The solar thermal covers a % of the heating kwh, so we need to adjust the cost return (kwh / cop) * assumptions.SOLAR_CONSUMPTION_PROPORTION * cls.ELECTRICITY_PRICE_CAP + if fuel in ['Oil + Solar Thermal']: + # The solar thermal covers a % of the heating kwh, so we need to adjust the cost + price_data = cls.FUEL_DATA[cls.FUEL_DATA["Fuel"] == "Kerosene"].squeeze() + cost_per_kwh = cls.cost_per_kwh( + price_data["Price (p)"], price_data["Energy Content, Net Calorific value (kWh/unit)"] + ) + return (kwh / cop) * cost_per_kwh * assumptions.SOLAR_CONSUMPTION_PROPORTION + if fuel == "LPG + Solar Thermal": # The solar thermal covers a % of the heating kwh, so we need to adjust the cost price_data = cls.FUEL_DATA[cls.FUEL_DATA["Fuel"] == "LPG"].squeeze() diff --git a/backend/tests/test_integration.py b/backend/tests/test_integration.py index 45dd109a..cdc27abd 100644 --- a/backend/tests/test_integration.py +++ b/backend/tests/test_integration.py @@ -82,6 +82,12 @@ costs_by_floor_area = epc_data[ ][["TOTAL_FLOOR_AREA", "CURRENT_ENERGY_EFFICIENCY", "LIGHTING_COST_CURRENT", "HEATING_COST_CURRENT", "HOT_WATER_COST_CURRENT"]].copy() +epc_data = epc_data[ + (epc_data["MAINHEAT_DESCRIPTION"].str.contains("SAP05:") == False) & + (~epc_data["LIGHTING_COST_CURRENT"].isin([None, ""])) & + (~pd.isnull(epc_data["LIGHTING_COST_CURRENT"])) + ] + costs_by_floor_area.columns = [c.lower().replace("_", "-") for c in costs_by_floor_area.columns] for c in ["lighting-cost-current", "heating-cost-current", "hot-water-cost-current"]: costs_by_floor_area[c + "_scaled"] = costs_by_floor_area[c] / costs_by_floor_area["total-floor-area"] @@ -92,8 +98,8 @@ costs_by_floor_area = costs_by_floor_area.groupby("current-energy-efficiency")[ epc_data = epc_data[~pd.isnull(epc_data["UPRN"])] -sample_epc_data = epc_data[pd.to_datetime(epc_data["LODGEMENT_DATE"]) >= "2015-01-01"].drop_duplicates("UPRN").sample( - 10000).reset_index(drop=True) +sample_epc_data = epc_data[pd.to_datetime(epc_data["LODGEMENT_DATE"]) >= "2008-01-01"].drop_duplicates("UPRN").sample( + 50000).reset_index(drop=True) # TODO: In Property find_energy_sources, sort out biomass community heating - what fuel type # TODO: We might be able to remove find_energy_sources entirely and remove estimate_electrical_consumption. It's used @@ -163,6 +169,8 @@ mocked_kwh_predictions["heating_kwh_predictions"] = pd.DataFrame(mocked_kwh_pred mocked_kwh_predictions["hotwater_kwh_predictions"] = pd.DataFrame(mocked_kwh_predictions["hotwater_kwh_predictions"]) # TODO: We might want to implement this generally, via an ETL process +for x in cleaned["mainheat-description"]: + x["has_wood_chips"] = False for p in input_properties: for col in ["lighting-cost-current", "heating-cost-current", "hot-water-cost-current"]: if pd.isnull(p.data[col]): @@ -313,6 +321,10 @@ for p in tqdm(input_properties): if not recommendations.get(p.id): continue + # Temp allow to skip + if not isinstance(recommendations.get(p.id)[0], list): + continue + # we need to double unlist because we have a list of lists property_measure_types = {rec["type"] for recs in recommendations[p.id] for rec in recs} property_required_measures = [m for m in recommendations[p.id] if m[0]["type"] in body.required_measures] @@ -336,32 +348,32 @@ for p in tqdm(input_properties): ) gain = optimiser_functions.calculate_gain(body=body, p=p, fixed_gain=fixed_gain, eco_packages=eco_packages) - funding = Funding( - tenure=body.housing_type, - project_scores_matrix=project_scores_matrix, - partial_project_scores_matrix=partial_project_scores_matrix, - whlg_eligible_postcodes=whlg_eligible_postcodes, - eco4_social_cavity_abs_rate=13, - eco4_social_solid_abs_rate=17, - eco4_private_cavity_abs_rate=13, - eco4_private_solid_abs_rate=17, - gbis_social_cavity_abs_rate=21, - gbis_social_solid_abs_rate=25, - gbis_private_cavity_abs_rate=21, - gbis_private_solid_abs_rate=28, - ) - - li_thickness = convert_thickness_to_numeric( - p.roof["insulation_thickness"], p.roof["is_pitched"], p.roof["is_flat"] - ) - current_wall_u_value = p.walls["thermal_transmittance"] - if current_wall_u_value is None: - current_wall_u_value = get_wall_u_value( - clean_description=p.walls["clean_description"], - age_band=p.age_band, - is_granite_or_whinstone=p.walls["is_granite_or_whinstone"], - is_sandstone_or_limestone=p.walls["is_sandstone_or_limestone"], - ) + # funding = Funding( + # tenure=body.housing_type, + # project_scores_matrix=project_scores_matrix, + # partial_project_scores_matrix=partial_project_scores_matrix, + # whlg_eligible_postcodes=whlg_eligible_postcodes, + # eco4_social_cavity_abs_rate=13, + # eco4_social_solid_abs_rate=17, + # eco4_private_cavity_abs_rate=13, + # eco4_private_solid_abs_rate=17, + # gbis_social_cavity_abs_rate=21, + # gbis_social_solid_abs_rate=25, + # gbis_private_cavity_abs_rate=21, + # gbis_private_solid_abs_rate=28, + # ) + # + # li_thickness = convert_thickness_to_numeric( + # p.roof["insulation_thickness"], p.roof["is_pitched"], p.roof["is_flat"] + # ) + # current_wall_u_value = p.walls["thermal_transmittance"] + # if current_wall_u_value is None: + # current_wall_u_value = get_wall_u_value( + # clean_description=p.walls["clean_description"], + # age_band=p.age_band, + # is_granite_or_whinstone=p.walls["is_granite_or_whinstone"], + # is_sandstone_or_limestone=p.walls["is_sandstone_or_limestone"], + # ) # We insert the innovation uplift measures_to_optimise_with_uplift = deepcopy(measures_to_optimise) @@ -369,35 +381,39 @@ for p in tqdm(input_properties): # TODO: Turn this into a function and store the innovaiton uplift for group in measures_to_optimise_with_uplift: for r in group: - - if r["type"] in ["mechanical_ventilation", "low_energy_lighting", "secondary_heating", - "extension_cavity_wall_insulation", "draught_proofing", "sealing_open_fireplace"]: - ( - r["partial_project_score"], - r["partial_project_funding"], - r["innovation_uplift"], - r["uplift_project_score"], - ) = ( - 0, 0, 0, 0 - ) - continue - - ( - r["partial_project_score"], r["partial_project_funding"], r["innovation_uplift"], - r["uplift_project_score"] - ) = funding.get_innovation_uplift( - measure=r, - starting_sap=int(p.data["current-energy-efficiency"]), - floor_area=p.floor_area, - is_cavity=p.walls["is_cavity_wall"], - current_wall_uvalue=current_wall_u_value, - is_partial="partial" in p.walls["clean_description"].lower(), - existing_li_thickness=li_thickness, - mainheating=p.main_heating, - main_fuel=p.main_fuel, - mainheat_energy_eff=p.data["mainheat-energy-eff"], + (r["partial_project_score"], r["partial_project_funding"], r["innovation_uplift"], + r["uplift_project_score"]) = ( + 0, 0, 0, 0 ) + # if r["type"] in ["mechanical_ventilation", "low_energy_lighting", "secondary_heating", + # "extension_cavity_wall_insulation", "draught_proofing", "sealing_open_fireplace"]: + # ( + # r["partial_project_score"], + # r["partial_project_funding"], + # r["innovation_uplift"], + # r["uplift_project_score"], + # ) = ( + # 0, 0, 0, 0 + # ) + # continue + # + # ( + # r["partial_project_score"], r["partial_project_funding"], r["innovation_uplift"], + # r["uplift_project_score"] + # ) = funding.get_innovation_uplift( + # measure=r, + # starting_sap=int(p.data["current-energy-efficiency"]), + # floor_area=p.floor_area, + # is_cavity=p.walls["is_cavity_wall"], + # current_wall_uvalue=current_wall_u_value, + # is_partial="partial" in p.walls["clean_description"].lower(), + # existing_li_thickness=li_thickness, + # mainheating=p.main_heating, + # main_fuel=p.main_fuel, + # mainheat_energy_eff=p.data["mainheat-energy-eff"], + # ) + if r["already_installed"]: # if already installed, we zero out the uplift and funding (r["partial_project_score"], r["partial_project_funding"], r["innovation_uplift"], @@ -411,7 +427,7 @@ for p in tqdm(input_properties): ) # When the goal is Increasing EPC, we can run the funding optimiser - if body.goal == "Increasing EPC": + if body.goal == "Switch off": solutions = optimise_with_funding_paths( p=p, @@ -481,37 +497,43 @@ for p in tqdm(input_properties): ROOF_INSULATION_MEASURES ) - funding.check_funding( - measures=solution, - starting_sap=int(p.data["current-energy-efficiency"]), - ending_sap=int(p.data["current-energy-efficiency"]) + sum([x["gain"] for x in solution]), - floor_area=p.floor_area, - mainheat_description=p.main_heating["clean_description"], - heating_control_description=p.main_heating_controls["clean_description"], - is_cavity=p.walls["is_cavity_wall"], - current_wall_uvalue=current_wall_u_value, - is_partial="partial" in p.walls["clean_description"].lower(), - existing_li_thickness=li_thickness, - mainheating=p.main_heating, - main_fuel=p.main_fuel, - mainheat_energy_eff=p.data["mainheat-energy-eff"], - has_wall_insulation_recommendation=has_wall_insulation_recommendation, - has_roof_insulation_recommendation=has_roof_insulation_recommendation, - ) + # funding.check_funding( + # measures=solution, + # starting_sap=int(p.data["current-energy-efficiency"]), + # ending_sap=int(p.data["current-energy-efficiency"]) + sum([x["gain"] for x in solution]), + # floor_area=p.floor_area, + # mainheat_description=p.main_heating["clean_description"], + # heating_control_description=p.main_heating_controls["clean_description"], + # is_cavity=p.walls["is_cavity_wall"], + # current_wall_uvalue=current_wall_u_value, + # is_partial="partial" in p.walls["clean_description"].lower(), + # existing_li_thickness=li_thickness, + # mainheating=p.main_heating, + # main_fuel=p.main_fuel, + # mainheat_energy_eff=p.data["mainheat-energy-eff"], + # has_wall_insulation_recommendation=has_wall_insulation_recommendation, + # has_roof_insulation_recommendation=has_roof_insulation_recommendation, + # ) # Determine the scheme scheme = "none" - if funding.eco4_eligible: - scheme = "eco4" - if scheme == "none" and funding.gbis_eligible: - scheme = "gbis" + # if funding.eco4_eligible: + # scheme = "eco4" + # if scheme == "none" and funding.gbis_eligible: + # scheme = "gbis" - funded_measures = solution if scheme in ["gbis", "eco4"] else [] - project_funding = 0 if funding.full_project_abs is not None else funding.full_project_abs - total_uplift = funding.eco4_uplift - full_project_score = 0 if funding.full_project_abs is not None else funding.full_project_abs - partial_project_score = funding.partial_project_abs - uplift_project_score = funding.eco4_uplift if scheme == "eco4" else funding.gbis_uplift + funded_measures = [] + # funded_measures = solution if scheme in ["gbis", "eco4"] else [] + # project_funding = 0 if funding.full_project_abs is not None else funding.full_project_abs + project_funding = 0 + # total_uplift = funding.eco4_uplift + total_uplift = 0 + # full_project_score = 0 if funding.full_project_abs is not None else funding.full_project_abs + full_project_score = 0 + # partial_project_score = funding.partial_project_abs + partial_project_score = 0 + # uplift_project_score = funding.eco4_uplift if scheme == "eco4" else funding.gbis_uplift + uplift_project_score = 0 selected = {r["id"] for r in solution} diff --git a/etl/customers/lincs_rural/get_missed.py b/etl/customers/lincs_rural/get_missed.py new file mode 100644 index 00000000..d25449c5 --- /dev/null +++ b/etl/customers/lincs_rural/get_missed.py @@ -0,0 +1,47 @@ +# After going back to Lincs rural, they gave us some additional data that we can use to try to fetch missed UPRNs again +import pandas as pd + +# missed = pd.read_excel( +# "/Users/khalimconn-kowlessar/Downloads/lincs_rural_missed_nov_2025.xlsx", +# sheet_name="Missed Properties" +# ) +# missed = missed[~pd.isnull(missed["rrn"])] + +prepared = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/lincs_rural_standardised_ara_nov_2025.xlsx", + sheet_name="Standardised Asset List" +) + +updated_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/MASTER LIST EPCS UPDATED November 2025 Domna Homes - Copy.xlsx", + sheet_name="PROPERTY EPC RATINGS" +) +updated_data = updated_data[~pd.isnull(updated_data["Property Ref."])] + +missed = updated_data[~updated_data["Property Ref."].isin(prepared["landlord_property_id"].values.tolist())].copy() +# missed.to_csv("/Users/khalimconn-kowlessar/Downloads/lincs_rural_missed_uprn.csv") +# We'll grab the UPRNs manually and then pull them in, and prepare for ARA + +missing_uprns = pd.read_csv("/Users/khalimconn-kowlessar/Downloads/lincs_rural_missed_uprn.csv") + +missing_uprns["landlord_property_id"] = missing_uprns["Property Ref."].copy() +missing_uprns["domna_property_id"] = missing_uprns["Property Ref."].copy() +missing_uprns["domna_address_1"] = missing_uprns['Unnamed: 1'].str.split(",").str[0].str.strip() +missing_uprns["postcode"] = missing_uprns['Unnamed: 1'].str.split(",").str[-1].str.strip() +missing_uprns["landlord_property_type"] = "unknown" +missing_uprns["landlord_built_form"] = "unknown" +missing_uprns["domna_full_address"] = missing_uprns['Unnamed: 1'].copy() + +missed_standardised_for_ara = missing_uprns[ + ['landlord_property_id', 'domna_address_1', 'landlord_property_type', 'landlord_built_form', 'postcode', + 'domna_property_id', 'UPRN'] +].rename( + columns={"UPRN": "epc_os_uprn"} +) + +# Store +missed_standardised_for_ara.to_excel( + "/Users/khalimconn-kowlessar/Downloads/lincs_rural_missed_standardised_ara_nov_2025.xlsx", + index=False, + sheet_name="Standardised Asset List" +) diff --git a/etl/customers/peabody/Nov 2025 Consulting Project/data_prep.py b/etl/customers/peabody/Nov 2025 Consulting Project/a_data_prep.py similarity index 100% rename from etl/customers/peabody/Nov 2025 Consulting Project/data_prep.py rename to etl/customers/peabody/Nov 2025 Consulting Project/a_data_prep.py diff --git a/etl/customers/peabody/Nov 2025 Consulting Project/b_data_cleanse.py b/etl/customers/peabody/Nov 2025 Consulting Project/b_data_cleanse.py new file mode 100644 index 00000000..13faa371 --- /dev/null +++ b/etl/customers/peabody/Nov 2025 Consulting Project/b_data_cleanse.py @@ -0,0 +1,147 @@ +""" +We have found, within the Peabody data, a large volume of properties with missing and incorrects +UPRNS and incorrect address data. We want to flag these records and also find missings where we can + +We also have duplicate UPRNS that should be flagged +""" +import json +import time +import os +import pandas as pd +import numpy as np +from tqdm import tqdm +from dotenv import load_dotenv +from asset_list.utils import get_data_for_property +from utils.logger import setup_logger +from utils.s3 import read_io_from_s3, save_dataframe_to_s3_parquet, read_dataframe_from_s3_parquet + +logger = setup_logger() + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + +sustainability_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " + "- Data Extracts for Domna.xlsx", + sheet_name="Sustainability" +) +property_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " + "- Data Extracts for Domna.xlsx", + sheet_name="Properties" +) + +missing_uprns = sustainability_data[pd.isnull(sustainability_data['UPRN'])].copy() + +# Any non-numeric UPRNS or leading with 0s are invalid +non_numeric_uprns = sustainability_data[ + ~sustainability_data['UPRN'].astype(str).str.match(r'^[1-9][0-9]*$') & ~pd.isnull(sustainability_data['UPRN']) + ].copy() +# 70 properties +leading_zero_uprns = sustainability_data[ + sustainability_data['UPRN'].astype(str).str.startswith('0') +].copy() + +# Flag duplicates +duplicate_uprns = sustainability_data[ + sustainability_data.duplicated(subset=['UPRN'], keep=False) & ~pd.isnull(sustainability_data['UPRN']) + ].copy() + +# Store this data +# missing_uprns.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting +# Project/data_validation/missing_uprns.csv", index=False) +# non_numeric_uprns.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting +# Project/data_validation/non_numeric_uprns.csv", index=False) +# leading_zero_uprns.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting +# Project/data_validation/leading_zero_uprns.csv", index=False) +# duplicate_uprns.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting +# Project/data_validation/duplicate_uprns.csv", index=False) + +# Take everything remaining +data_needing_validation = sustainability_data[ + ~sustainability_data["Org Ref"].isin( + missing_uprns["Org Ref"].values.tolist() + non_numeric_uprns["Org Ref"].values.tolist() + + leading_zero_uprns["Org Ref"].values.tolist() + duplicate_uprns["Org Ref"].values.tolist() + ) +].copy() + +# TODO: We should build a SAL for UPRNS that are missing, invalid or duplicated + +# We check UPRN validity against our OS data +uprn_filenames = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key="spatial/filename_meta.parquet" +) + +# We're going to: +# 1) Grab a filename +# 2) Read it in +# 3) Check which UPRNS from our data are in that file +# 4) Keep a record of which UPRNS were found where + +for uprn_file in tqdm(uprn_filenames['filenames'].values, total=len(uprn_filenames)): + spatial_data = read_dataframe_from_s3_parquet( + bucket_name="retrofit-data-dev", file_key=f"spatial/{uprn_file}" + ) + + uprns_in_file = data_needing_validation[ + data_needing_validation['UPRN'].astype('Int64').isin(spatial_data['UPRN'].astype('Int64').values) + ].copy() + + print("Found {} UPRNS in file {}".format(len(uprns_in_file), uprn_file)) + if len(uprns_in_file) > 0: + # Store the found UPRNS in the validation cache + data_to_store = uprns_in_file[["Org Ref", "UPRN"]].copy() + data_to_store["Source File"] = uprn_file + # Store + data_to_store.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " + f"Project/data_validation/validation_cache/{uprn_file.split('.parquet')[0]}_found_uprns.csv", + index=False + ) + +# Get all of the files: +storage_locations = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " + "Project/data_validation/validation_cache") +# List contents +folder_contents = os.listdir(storage_locations) +# Grab files and concatenate +all_found_uprns = [] +for file in folder_contents: + if file.endswith("_found_uprns.csv"): + df = pd.read_csv(os.path.join(storage_locations, file)) + all_found_uprns.append(df) + +all_found_uprns = pd.concat(all_found_uprns) + +# We now flag any UPRNS that were not found in any of the OS datasets +os_missed_uprns = data_needing_validation[ + ~data_needing_validation['Org Ref'].isin(all_found_uprns['Org Ref'].values.tolist()) +].copy() + +# store +os_missed_uprns.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " + "Project/data_validation/os_missed_uprns.csv", + index=False +) + +# Now build a larger table for standardisation +to_standardised = pd.concat( + [missing_uprns, non_numeric_uprns, leading_zero_uprns, duplicate_uprns, os_missed_uprns] +) + +to_standardised.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " + "Project/data_validation/to_standardise_uprns.xlsx", + index=False) + +# We prepare a finalised dataset to work with, that excludes all problematic properties and leaves us with +# properties for which we have the data we need + +finalised_data = sustainability_data[ + ~sustainability_data["Org Ref"].isin( + to_standardised["Org Ref"].values.tolist() + ) +].copy() + +# Prepare with the column formats we need, as analogous to a_data_prep where we defined an initial working sample diff --git a/etl/customers/peabody/Nov 2025 Consulting Project/c_finalised_modelling_data.py b/etl/customers/peabody/Nov 2025 Consulting Project/c_finalised_modelling_data.py new file mode 100644 index 00000000..2868bce5 --- /dev/null +++ b/etl/customers/peabody/Nov 2025 Consulting Project/c_finalised_modelling_data.py @@ -0,0 +1,114 @@ +import pandas as pd + +# import pandas as pd +# +# sal = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " +# "Project/data_validation/to_standardise_uprns - Standardised.xlsx", +# sheet_name="Standardised Asset List" +# ) +# +# # Quick breadown of missingness +# missing = sal[ +# pd.isnull(sal["estimated"]) | (sal["estimated"] == True) | pd.isnull(sal["epc_os_uprn"]) +# ] +# +# fetched = sal[(sal["estimated"] == False) | ~pd.isnull(sal["epc_os_uprn"])].copy() +# fetched = fetched[ +# ["landlord_property_id", "domna_address_1", "domna_postcode", "domna_full_address", "epc_address1", +# "epc_postcode", "epc_address", "landlord_property_type", "epc_property_type"] +# ] +# +# known_issues = [ +# +# ] +# +# # Missed postcodes +# missed_postcode_agg = missing.groupby("domna_postcode").size().reset_index(name="count") +# missed_postcode_agg = missed_postcode_agg.sort_values("count", ascending=False) +# +# multi_missed_postcode = missed_postcode_agg[missed_postcode_agg["count"] > 1] + +### Prepare +sustainability_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " + "- Data Extracts for Domna.xlsx", + sheet_name="Sustainability" +) + +# Data we want to remove: +missing_uprns = sustainability_data[pd.isnull(sustainability_data['UPRN'])].copy() + +# Any non-numeric UPRNS or leading with 0s are invalid +non_numeric_uprns = sustainability_data[ + ~sustainability_data['UPRN'].astype(str).str.match(r'^[1-9][0-9]*$') & ~pd.isnull(sustainability_data['UPRN']) + ].copy() +# 70 properties +leading_zero_uprns = sustainability_data[ + sustainability_data['UPRN'].astype(str).str.startswith('0') +].copy() + +# Flag duplicates +duplicate_uprns = sustainability_data[ + sustainability_data.duplicated(subset=['UPRN'], keep=False) & ~pd.isnull(sustainability_data['UPRN']) + ].copy() + +# Read in the UPRNs that were not valid based on the OS data +os_missed_uprns = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " + "Project/data_validation/os_missed_uprns.csv", +) + +modelling_data = sustainability_data[ + ~sustainability_data["Org Ref"].isin( + missing_uprns["Org Ref"].unique().tolist() + non_numeric_uprns["Org Ref"].unique().tolist() + + leading_zero_uprns["Org Ref"].unique().tolist() + duplicate_uprns["Org Ref"].unique().tolist() + + os_missed_uprns["Org Ref"].unique().tolist() + ) +].copy() + +# Need to prepare for upload +# Variables: + + +modelling_data["landlord_property_id"] = sustainability_data["Org Ref"].copy() +modelling_data["domna_property_id"] = sustainability_data["Org Ref"].copy() + +modelling_data = modelling_data.rename( + { + "Address 1": "domna_address_1", + "Postcode": "postcode", + "Type": "landlord_property_type", + "Attachment": "landlord_built_form", + "Heating": "landlord_heating_system", + "UPRN": "epc_os_uprn" + } +) + +modelling_data = modelling_data[ + [ + "domna_address_1", "Address 2", "Address 3", "postcode", "landlord_property_type", + "landlord_built_form", "landlord_heating_system", "epc_os_uprn", "Total Floor Area (m2)", + "domna_property_id", "domna_full_address" + ] +] + +modelling_data["landlord_built_form"] = modelling_data["landlord_built_form"].map( + { + "MidTerrace": "Mid-Terrace", + "EndTerrace": "End-Terrace", + "SemiDetached": "Semi-Detached", + "Detached": "Detached", + "EnclosedEndTerrace": "Enclosed End-Terrace", + "EnclosedMidTerrace": "Enclosed Mid-Terrace", + } +) + + +def make_full_address(x): + to_join = [x['domna_address_1'], x['Address 2'], x['Address 3']] + to_join = [x for x in to_join if not pd.isnull(x) and x != ''] + return ", ".join(to_join) + + +modelling_data["domna_full_address"] = modelling_data.apply(lambda x: make_full_address(x), axis=1) diff --git a/etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py b/etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py deleted file mode 100644 index a1be533d..00000000 --- a/etl/customers/peabody/Nov 2025 Consulting Project/data_cleanse.py +++ /dev/null @@ -1,6 +0,0 @@ -""" -We have found, within the Peabody data, a large volume of properties with missing and incorrects -UPRNS and incorrect address data. We want to flag these records and also find missings where we can - -We also have duplicate UPRNS that should be flagged -""" diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 8fa3e13c..74dcfc56 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -844,7 +844,7 @@ class TrainingDataset(BaseDataset): # Make sure they are all efficiency columns if any(~missings.index.str.contains("energy_eff")): - raise ValueError("Non efficiency columns are missing") + raise ValueError(f"Non efficiency columns are missing {missings.index}") for m in missings.index: self.df[m] = self.df[m].fillna("NO_RATING") diff --git a/etl/webscrape/Zoopla.py b/etl/webscrape/Zoopla.py index 4c0443f1..9d15e019 100644 --- a/etl/webscrape/Zoopla.py +++ b/etl/webscrape/Zoopla.py @@ -15,25 +15,10 @@ os.makedirs(CACHE_DIR, exist_ok=True) def random_delay(): - """Pause randomly between requests (0.5–2 s).""" time.sleep(random.uniform(0.5, 2)) -def extract_feature(soup, icon_id): - tag = soup.find("use", href=f"#{icon_id}") - if tag: - parent = tag.find_parent("div", class_="_1pbf8i53") - if parent: - text = parent.get_text(strip=True) - return text - return None - - def extract_embedded_json(text): - """ - Extract embedded property JSON containing attributes, energy, estimates, and sales history. - """ - # Try to grab everything after "attributes" match = re.search( r'"attributes"\s*:\s*\{.*?\}\s*,.*?"historicSales".*?\]', text, @@ -48,13 +33,16 @@ def extract_embedded_json(text): except json.JSONDecodeError: pass - # fallback for independent keys result = {} for key in [ "attributes", "energy", "rentEstimate", "saleEstimate", "saleHistory", "historicSales" ]: - key_match = re.search(rf'"{key}"\s*:\s*(\{{.*?\}}|\[.*?\])', text, re.DOTALL) + key_match = re.search( + rf'"{key}"\s*:\s*(\{{.*?\}}|\[.*?\])', + text, + re.DOTALL + ) if key_match: try: result[key] = json.loads(key_match.group(1)) @@ -64,28 +52,23 @@ def extract_embedded_json(text): def scrape_all_estimates(session, url): - """Scrape valuation estimates for one Zoopla property URL.""" resp = session.get(url, impersonate=random.choice(ENGINES)) html = resp.text - page_source = BeautifulSoup(resp.text, "html.parser") - estimates = page_source.find_all("div", {"data-testid": "sale-estimate"}) - + soup = BeautifulSoup(html, "html.parser") + estimates = soup.find_all("div", {"data-testid": "sale-estimate"}) data = extract_embedded_json(html) - is_blocked = len(estimates) == 0 - return { "estimates": estimates, - "is_blocked": is_blocked, + "is_blocked": len(estimates) == 0, "response_html": html, - "attributes": data.get("attributes"), - "rent": data.get("rentEstimate"), - "historicSales": data.get("historicSales"), + "attributes": data.get("attributes", {}), + "rentEstimate": data.get("rentEstimate", {}), + "historicSales": data.get("historicSales", []), } def extract_estimates(estimates): - """Extract low, mid, and high estimates from parsed HTML.""" est = estimates[0] low = est.find("span", {"data-testid": "low-estimate-blurred"}).text mid = est.find("p", {"data-testid": "estimate-blurred"}).text @@ -94,110 +77,123 @@ def extract_estimates(estimates): def cache_path_for_url(url): - """Return a deterministic local cache path for a URL.""" uprn = url.split("/")[-2] return os.path.join(CACHE_DIR, f"{uprn}.html") +def parse_cached_html(url, html): + soup = BeautifulSoup(html, "html.parser") + estimates = soup.find_all("div", {"data-testid": "sale-estimate"}) + data = extract_embedded_json(html) + history = data.get("historicSales") or [{}] + + if not estimates: + return None + + low, mid, high = extract_estimates(estimates) + + return { + "URL": url, + "Low Estimate": low, + "Middle Estimate": mid, + "High Estimate": high, + **data.get("attributes", {}), + **data.get("rentEstimate", {}), + **history[0], + } + + def parallel_task(url): - """Main worker function executed in each process.""" cache_path = cache_path_for_url(url) - # Use cached file if it exists if os.path.exists(cache_path): - html = open(cache_path, "r").read() - page_source = BeautifulSoup(html, "html.parser") - estimates = page_source.find_all("div", {"data-testid": "sale-estimate"}) - data = extract_embedded_json(html) - history_sales = data.get("historicSales", [{}]) - if len(history_sales) == 0: - history_sales = [{}] + with open(cache_path, "r", encoding="utf-8") as f: + html = f.read() + cached = parse_cached_html(url, html) + if cached: + return cached - if estimates: - low, mid, high = extract_estimates(estimates) - return { - "URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high, - **data.get("attributes", {}), **data.get("rentEstimate", {}), - **history_sales[0] - } - - # Otherwise scrape live with StealthSession() as session: - attempts = 0 - while attempts < 5: + for attempt in range(5): output = scrape_all_estimates(session, url) + if not output["is_blocked"] and output["estimates"]: - open(cache_path, "w").write(output["html"]) + html = output.get("response_html") + if html: + with open(cache_path, "w", encoding="utf-8") as f: + f.write(html) + + history = output.get("historicSales") or [{}] low, mid, high = extract_estimates(output["estimates"]) - history_sales = output.get("historicSales", [{}]) - if len(history_sales) == 0: - history_sales = [{}] + return { - "URL": url, "Low Estimate": low, "Middle Estimate": mid, "High Estimate": high, + "URL": url, + "Low Estimate": low, + "Middle Estimate": mid, + "High Estimate": high, **output.get("attributes", {}), - **output.get("rent", {}), - **history_sales[0] + **output.get("rentEstimate", {}), + **history[0], } - attempts += 1 - print(f"[Attempt {attempts}] Blocked or empty for {url}") + random_delay() - # If still blocked, return placeholders - return {"URL": url, "Low Estimate": None, "Middle Estimate": None, "High Estimate": None} + return { + "URL": url, + "Low Estimate": None, + "Middle Estimate": None, + "High Estimate": None, + } def parse_price(p): - if p is None: + if not p: return None p = p.replace("£", "").strip().lower() - if not p: - return None if p.endswith("k"): return float(p[:-1]) * 1_000 - elif p.endswith("m"): + if p.endswith("m"): return float(p[:-1]) * 1_000_000 - else: - try: - return float(p.replace(",", "")) - except ValueError: - return None + + try: + return float(p.replace(",", "")) + except ValueError: + return None if __name__ == "__main__": - # Load portfolio asset_list = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/sfr/October 2025 AL portfolio/22.10 AL Portfolio - " - "Standardised - partial UPRN fill.xlsx", + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " + "Project/modelling_sample.xlsx", sheet_name="Standardised Asset List" ) + asset_list = asset_list[~pd.isnull(asset_list["epc_os_uprn"])] + asset_list = asset_list.drop_duplicates("epc_os_uprn") asset_list["epc_os_uprn"] = asset_list["epc_os_uprn"].astype(int).astype(str) + uprns = asset_list["epc_os_uprn"].tolist() urls = [f"https://www.zoopla.co.uk/property/uprn/{uprn}/" for uprn in uprns] - # Limit concurrency to avoid blocks - with Pool(processes=2) as pool: # fewer processes = fewer fingerprints + with Pool(processes=2) as pool: estimates_list = list( tqdm(pool.imap(parallel_task, urls), total=len(urls)) ) df = pd.DataFrame(estimates_list) - - print(df.head()) - df["uprn"] = df["URL"].str.extract(r"uprn/(\d+)/") df["valuation"] = df["Middle Estimate"].apply(parse_price) df.to_csv("zoopla_estimates.csv", index=False) - # Merge with asset list merged = asset_list.merge( df[["uprn", "valuation"]], left_on="epc_os_uprn", right_on="uprn", how="left" ) + merged.to_excel( "20251029 AL Portfolio - Standardised - with valuations.xlsx", index=False diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index f6618f22..d05275ea 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -11,8 +11,8 @@ from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcMod # PORTFOLIO_ID = 206 # SCENARIOS = [389] -PORTFOLIO_ID = 388 -SCENARIOS = [803] +PORTFOLIO_ID = 404 +SCENARIOS = [829] def get_data(portfolio_id, scenario_ids): @@ -121,7 +121,8 @@ recommendations_measures_pivot["total_retrofit_cost"] = recommendations_measures df = properties_df[ [ - "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", "heating", "windows", + "landlord_property_id", "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", + "heating", "windows", "current_epc_rating", "current_sap_points", "total_floor_area", "number_of_rooms", ] @@ -143,7 +144,7 @@ from utils.s3 import read_csv_from_s3, read_excel_from_s3 # asset_list = read_csv_from_s3(bucket_name="retrofit-plan-inputs-dev", filepath='8/206/asset_list.csv') asset_list = read_excel_from_s3( - bucket_name="retrofit-plan-inputs-dev", file_key='2/388/20251208T203603925Z/asset_list.xlsx', + bucket_name="retrofit-plan-inputs-dev", file_key="2/404/20251211T163200754Z/asset_list.xlsx", header_row=0, sheet_name="Standardised Asset List" ) asset_list = pd.DataFrame(asset_list) @@ -201,11 +202,15 @@ asset_list = asset_list.merge( ) # For exporting -asset_list.to_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " - "Project/20251209_sample_package_data.xlsx", +df.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lincs Rural/EPC C -without floors proposed measures - " + "with ID.xlsx", index=False ) +# asset_list.to_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lincs Rural/epc_measures.xlsx", +# index=False +# ) condition_costs = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/sfr/Spring JV/Condition costs.xlsx",