fix missing file

This commit is contained in:
Khalim Conn-Kowlessar 2026-01-13 19:32:53 +00:00
parent c07b012ebb
commit 15725a1d13
6 changed files with 598 additions and 37 deletions

View file

@ -1,34 +1,5 @@
import pandas as pd
# import pandas as pd
#
# sal = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
# "Project/data_validation/to_standardise_uprns - Standardised.xlsx",
# sheet_name="Standardised Asset List"
# )
#
# # Quick breakdown of missingness
# missing = sal[
# pd.isnull(sal["estimated"]) | (sal["estimated"] == True) | pd.isnull(sal["epc_os_uprn"])
# ]
#
# fetched = sal[(sal["estimated"] == False) | ~pd.isnull(sal["epc_os_uprn"])].copy()
# fetched = fetched[
# ["landlord_property_id", "domna_address_1", "domna_postcode", "domna_full_address", "epc_address1",
# "epc_postcode", "epc_address", "landlord_property_type", "epc_property_type"]
# ]
#
# known_issues = [
#
# ]
#
# # Missed postcodes
# missed_postcode_agg = missing.groupby("domna_postcode").size().reset_index(name="count")
# missed_postcode_agg = missed_postcode_agg.sort_values("count", ascending=False)
#
# multi_missed_postcode = missed_postcode_agg[missed_postcode_agg["count"] > 1]
### Prepare
sustainability_data = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody "

View file

@ -277,3 +277,185 @@ tenure_groups = sustainability_sample["Tenure Group"].value_counts().to_frame().
# Write the tenure_groups frame to a spreadsheet in Downloads, without the index column.
tenure_groups.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenure_groups.xlsx", index=False)
# Ad-hoc check: tenure-group counts for the rows that have a BlockCode (the result is not stored).
initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Tenure Group"].value_counts()
# First attempt at the sample: keep every property whose ownership type is NOT
# in the exclusion list below. The trailing comments record the property counts
# noted against each ownership type when the list was drawn up.
ownership_is_excluded = initial_asset_data["Ownership Type"].isin(
    [
        # Commercial # Everything is resi - based on the Residential Indicator variable - all are true
        # Freeholder
        "FREEHOLDER",  # 19517 properties
        # HOMEBUY / EQUITY LOAN
        "Rent to Homebuy",  # 1 property
        # Leaseholder
        "LEASEHOLD 100%",  # 8455 properties
        "Owned and Managed - 999 year lease",  # 2076 properties
        "Managed but not Owned-Private Lease",  # 159 properties
        "Owned and managed LEASEHOLD",  # 26 properties
        # Outright Sale - can't find anything matching
        # SHARED EQUITY
        "Shared Ownership",  # 4065 properties
        "Shared Ownership Owned Not Managed",  # 23 properties
        # Extra categories which seem sensible to exclude
        "NOT MANAGED AND NOT OWNED",
    ]
)
sample_data = initial_asset_data.loc[~ownership_is_excluded]

# Eyeball which ownership types survive the exclusion (result is not stored)
sample_data["Ownership Type"].value_counts()
# Second attempt: rebuild the sample from an explicit allow-list of ownership
# types. This rebinds sample_data.
kept_ownership_types = [
    "Owned and Managed",
    "Owned and Managed - 999 year lease",
    "Owned and managed LEASEHOLD",
    "LEASEHOLD 100%",
    "DATALOAD DEFAULT",
]
sample_data = initial_asset_data.loc[
    initial_asset_data["Ownership Type"].isin(kept_ownership_types)
]

# Every property whose UPRN did not make it into the sample, and the ownership
# types those properties carry (the counts are displayed, not stored)
uprn_in_sample = initial_asset_data["UPRN"].isin(sample_data["UPRN"].values)
dropped = initial_asset_data.loc[~uprn_in_sample]
dropped["Ownership Type"].value_counts()
# Print, one per line, how many properties carry each of the ownership types
# considered for exclusion.
ownership_column = initial_asset_data["Ownership Type"]
for value in (
    # Commercial # Everything is resi, so should be fine. No matches
    # Freeholder
    "FREEHOLDER",  # 19517 properties
    # HOMEBUY / EQUITY LOAN
    "Rent to Homebuy",  # 1 property
    # Leaseholder
    "LEASEHOLD 100%",  # 8455 properties
    "Owned and Managed - 999 year lease",  # 2076 properties
    "Managed but not Owned-Private Lease",  # 159 properties
    "Owned and managed LEASEHOLD",  # 26 properties
    # Outright Sale - can't find anything matching
    # SHARED EQUITY
    "Shared Ownership",  # 4065 properties
    "Shared Ownership Owned Not Managed",  # 23 properties
):
    print(len(initial_asset_data[ownership_column == value]))
# Property types treated as houses for the freeholder rule below.
house_types = [
    "HOUSE",
    "BUNGALOW",
    "MAISONETTE",
    "DUPLEX",
]

# Ownership types that are always kept in the sample.
# NOTE(review): presumably these are the types where the landlord controls the
# building - confirm with whoever defined the list.
guaranteed_control = [
    "Owned and Managed",
    "Owned and Managed - 999 year lease",
    "Owned and managed LEASEHOLD",
    "LEASEHOLD 100%",
    "DATALOAD DEFAULT",
]

# Third attempt at the sample: anything in guaranteed_control, plus freeholder
# properties, but only when they are one of the house types.
has_guaranteed_control = initial_asset_data["Ownership Type"].isin(guaranteed_control)
is_freehold_house = (
    initial_asset_data["Ownership Type"].eq("FREEHOLDER")
    & initial_asset_data["Property Type"].isin(house_types)
)
sample_data = initial_asset_data[has_guaranteed_control | is_freehold_house]
# Narrower sample: only these three ownership types are kept, with no
# property-type condition on the freeholders.
fabric_retrofit_ownership_types = [
    "Owned and Managed",
    "FREEHOLDER",
    "DATALOAD DEFAULT",
]
fabric_retrofit_sample = initial_asset_data.loc[
    initial_asset_data["Ownership Type"].isin(fabric_retrofit_ownership_types)
]
# Exploration of the BlockCode column. The three value_counts() calls are
# ad-hoc inspections whose results are not stored.
has_block_code = initial_asset_data["BlockCode"].notna()
initial_asset_data[~has_block_code]["Ownership Type"].value_counts()
initial_asset_data[has_block_code]["Ownership Type"].value_counts()
initial_asset_data[has_block_code]["Property Type"].value_counts()

# House-type properties that nevertheless carry a block code
z = initial_asset_data[
    has_block_code & initial_asset_data["Property Type"].isin(house_types)
]

# Number of such properties per block code, largest first
block_code_counts = z["BlockCode"].value_counts().reset_index()
block_code_agg = block_code_counts.sort_values("count", ascending=False)

# Every property in one specific block, pulled out for inspection
zz = initial_asset_data[initial_asset_data["BlockCode"] == "CHAT3343FM"]

# Candidate sample: every property that has a block code
potential_sample = initial_asset_data[has_block_code]
# Share of each property type among the block-coded properties, side by side
# with its share across the whole asset list. merge() defaults to an inner
# join, so only property types present on both sides are kept.
block_code_shares = (
    potential_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index()
)
overall_shares = (
    initial_asset_data["Property Type"].value_counts(normalize=True).to_frame().reset_index()
)
compare = block_code_shares.merge(
    overall_shares,
    left_on="Property Type",
    right_on="Property Type",
    suffixes=("_on_block_codes", "_overall"),
)
# Comparison of smaller sample vs overall
# The "Properties" and "Sustainability" sheets are read from the same v2 workbook.
peabody_v2_workbook = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 "
    "- Peabody "
    "- Data Extracts for Domna v2.xlsx"
)
new_asset_data = pd.read_excel(peabody_v2_workbook, sheet_name="Properties")
new_sustainability_data = pd.read_excel(peabody_v2_workbook, sheet_name="Sustainability")

# No sheet_name is given here, so pandas reads the first sheet of the workbook.
parity_workbook = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/Parity Data "
    "08012026.xlsx"
)
sap_bands = pd.read_excel(parity_workbook)
# Step 1: attach the sustainability rows to the asset rows (UPRN == "Org Ref").
# This is merge()'s default inner join, so unmatched rows on either side drop out.
assets_with_sustainability = new_asset_data.merge(
    new_sustainability_data,
    left_on="UPRN",
    right_on="Org Ref",
    suffixes=("_asset", "_sustainability"),
)

# Step 2: left-join the SAP / lodged EPC bands so that properties without a
# band row are still kept.
band_columns = sap_bands[["OrgRef", "SAP Band", "Lodged EPC Band"]]
combined = assets_with_sustainability.merge(
    band_columns,
    how="left",
    left_on="Org Ref",
    right_on="OrgRef",
)
# Reduced sample: drop every row whose "AH Tenure" is one of the excluded
# tenures. The copy makes reduced_sample an independent frame.
excluded_ah_tenures = [
    "Commercial",
    "Freeholder",
    "HOMEBUY / EQUITY LOAN",
    "Leaseholder",
    "Outright Sale",
    "SHARED EQUITY",
    "Shared Ownership",
]
has_excluded_tenure = combined["AH Tenure"].isin(excluded_ah_tenures)
reduced_sample = combined[~has_excluded_tenure].copy()
def _share_comparison(subset, overall, column):
    """Return the value shares of *column* in *subset* next to those in *overall*.

    Each side is reduced to its normalised value counts (fractions summing to
    1) and the two tables are joined on *column*. The join is merge()'s default
    inner join, so a category missing from either frame is left out of the
    result. The share columns are distinguished by the "_reduced_sample" and
    "_overall" suffixes.

    Args:
        subset: DataFrame holding the reduced sample.
        overall: DataFrame holding the full population.
        column: Name of the categorical column to compare.

    Returns:
        DataFrame with one row per category present in both frames.
    """
    subset_shares = subset[column].value_counts(normalize=True).to_frame().reset_index()
    overall_shares = overall[column].value_counts(normalize=True).to_frame().reset_index()
    return subset_shares.merge(
        overall_shares,
        left_on=column,
        right_on=column,
        suffixes=("_reduced_sample", "_overall"),
    )


# property types
property_type_comparison = _share_comparison(reduced_sample, combined, "Property Type")
# lodged ratings
lodged_epc_band_comparison = _share_comparison(reduced_sample, combined, "Lodged EPC Band")
# modelled ratings
modelled_epc_band_comparison = _share_comparison(reduced_sample, combined, "SAP Band")

View file

@ -0,0 +1,115 @@
import pandas as pd
# Original Peabody extract: the "Properties" and "Sustainability" sheets are
# read from the same workbook.
peabody_extract_workbook = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody "
    "- Data Extracts for Domna.xlsx"
)
initial_asset_data = pd.read_excel(peabody_extract_workbook, sheet_name="Properties")
sustainability_data = pd.read_excel(peabody_extract_workbook, sheet_name="Sustainability")

# v2 of the extract, stored under "Final SAL". Its "AH Tenure" column drives
# the ownership filtering further down.
peabody_extract_v2_workbook = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 "
    "- Peabody "
    "- Data Extracts for Domna v2.xlsx"
)
asset_data_v2 = pd.read_excel(peabody_extract_v2_workbook, sheet_name="Properties")
# Properties we want to keep: everything in the v2 extract whose "AH Tenure"
# is not one of the excluded tenures.
excluded_ah_tenures = {
    "Commercial",
    "Freeholder",
    "HOMEBUY / EQUITY LOAN",
    "Leaseholder",
    "Outright Sale",
    "SHARED EQUITY",
    "Shared Ownership",
}
desired_ownerships = asset_data_v2[~asset_data_v2["AH Tenure"].isin(excluded_ah_tenures)]

# Ad-hoc look at the ownership types that remain (result is not stored)
desired_ownerships["Ownership Type"].value_counts()

# Ownership-type counts for the properties in the original extract whose UPRN
# is not among the desired ones
uprn_not_desired = ~initial_asset_data["UPRN"].isin(desired_ownerships["UPRN"].values)
removed_ownerships = initial_asset_data.loc[uprn_not_desired, "Ownership Type"].value_counts()
# Load the ownership-filtered SAL.
# NOTE(review): this is the same path that the ExcelWriter at the end of this
# script writes to, so running the script overwrites its own input.
ownership_filtered_sal_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/20260112 - "
    "ownership filtered sal.xlsx"
)
sal = pd.read_excel(ownership_filtered_sal_path, sheet_name="Standardised Asset List")

# What did we include, that we shouldn't have?
desired_uprns = desired_ownerships["UPRN"].values
should_have_been_dropped = sal[~sal["landlord_property_id"].isin(desired_uprns)]

# ...and which desired properties are missing from the SAL?
sal_landlord_ids = sal["landlord_property_id"].values
needs_to_be_added = desired_ownerships[~desired_ownerships["UPRN"].isin(sal_landlord_ids)]
# Merge on ownership types
# (default inner join: SAL rows whose domna_property_id has no matching UPRN
# in the original extract are dropped here)
ownership_lookup = initial_asset_data[["UPRN", "Ownership Type"]]
sal = sal.merge(ownership_lookup, left_on="domna_property_id", right_on="UPRN")

# Remove the irrelevant ownership types
irrelevant_ownership_types = [
    # Commercial # Everything is resi - based on the Residential Indicator variable - all are true
    # Freeholder
    "FREEHOLDER",  # 19517 properties
    # HOMEBUY / EQUITY LOAN
    "Rent to Homebuy",  # 1 property
    # Leaseholder
    "LEASEHOLD 100%",  # 8455 properties
    "Owned and Managed - 999 year lease",  # 2076 properties
    "Managed but not Owned-Private Lease",  # 159 properties
    "Owned and managed LEASEHOLD",  # 26 properties
    # Outright Sale - can't find anything matching
    # SHARED EQUITY
    "Shared Ownership",  # 4065 properties
    "Shared Ownership Owned Not Managed",  # 23 properties
    # Extra categories which seem sensible to exclude
    "NOT MANAGED AND NOT OWNED",
]
sal = sal[~sal["Ownership Type"].isin(irrelevant_ownership_types)]

# Overwrite the landlord id with the domna property id
sal["landlord_property_id"] = sal["domna_property_id"].copy()
# Store this SAL in three batches
filename = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/20260112 - "
    "ownership filtered sal.xlsx"
)

# Row ranges for the batch sheets: two batches of 20000 rows, then whatever
# remains.
batch_slices = {
    "Batch 1": slice(0, 20000),
    "Batch 2": slice(20000, 40000),
    "Batch 3": slice(40000, None),
}

with pd.ExcelWriter(filename) as writer:
    # Full list first, so the workbook also holds the un-batched data
    sal.to_excel(writer, sheet_name="Standardised Asset List", index=False)
    # Store the three sections
    for sheet_name, rows in batch_slices.items():
        sal[rows].to_excel(writer, sheet_name=sheet_name, index=False)
# Test reading back in and assembling
# b1 = pd.read_excel(
# filename,
# sheet_name="Batch 1"
# )
# b2 = pd.read_excel(
# filename,
# sheet_name="Batch 2"
# )
# b3 = pd.read_excel(
# filename,
# sheet_name="Batch 3"
# )
# assembled_sal = pd.concat([b1, b2, b3])
# # Make sure we have the right # of UPRNs
# assert assembled_sal["epc_os_uprn"].nunique() == sal["epc_os_uprn"].nunique()

View file

@ -0,0 +1,293 @@
# ------ Pull in the full SAL sample ------
import pandas as pd
full_sal_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final "
    "SAL/Depracated/20260107 corrected batch 6 sal.xlsx"
)
full_sal = pd.read_excel(full_sal_path, sheet_name="Standardised Asset List")

# ------Pull in the reduced sample ------
# This has a slightly incorrect mix of ownership types. Some properties will need to be dropped and others, added
reduced_sal_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/20260112 - "
    "ownership filtered sal.xlsx"
)
reduced_sal = pd.read_excel(reduced_sal_path, sheet_name="Standardised Asset List")

# ------ Pull in the confirmed ownership column from Peabody ------
confirmed_ownership_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 "
    "- Peabody "
    "- Data Extracts for Domna v2.xlsx"
)
new_asset_data = pd.read_excel(confirmed_ownership_path, sheet_name="Properties")
# The correct sample is every property whose "AH Tenure" is not one of the
# excluded tenures. The copy makes it an independent frame.
excluded_ah_tenures = [
    "Commercial",
    "Freeholder",
    "HOMEBUY / EQUITY LOAN",
    "Leaseholder",
    "Outright Sale",
    "SHARED EQUITY",
    "Shared Ownership",
]
has_excluded_tenure = new_asset_data["AH Tenure"].isin(excluded_ah_tenures)
correct_sample = new_asset_data[~has_excluded_tenure].copy()
# ------- Stuff to add -------
# These are properties that need to be added to the reduced sample, from the SAL
reduced_landlord_ids = reduced_sal["landlord_property_id"]
missing_from_reduced = ~correct_sample["UPRN"].isin(reduced_landlord_ids.values)
stuff_to_add = correct_sample.loc[missing_from_reduced, "UPRN"].values
# Their rows are taken from the full SAL, matched on domna_property_id
sal_to_add = full_sal[full_sal["domna_property_id"].isin(stuff_to_add)].copy()

# ------- Stuff to remove -------
# Properties in the reduced sample that are not part of the correct sample
not_in_correct_sample = ~reduced_landlord_ids.isin(correct_sample["UPRN"].values)
stuff_to_remove = reduced_sal.loc[not_in_correct_sample, "landlord_property_id"].values
to_delete = reduced_sal[reduced_sal["landlord_property_id"].isin(stuff_to_remove)].copy()
# ------- Create the correctly formatted SAL, with an individual batch for properties we need to add -------
# This is what is correct, from the reduced sample, after removing the incorrect ownership types
flagged_for_removal = reduced_sal["landlord_property_id"].isin(stuff_to_remove)
reduced_sal_final = reduced_sal[~flagged_for_removal].copy()

# The rows being added take their landlord id from the domna property id
sal_to_add["landlord_property_id"] = sal_to_add["domna_property_id"].copy()

# NOTE: this rebinds full_sal. From here on it holds the corrected list, not
# the workbook loaded at the top of the script.
corrected_parts = [reduced_sal_final, sal_to_add]
full_sal = pd.concat(corrected_parts)
# filename = (
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/20260113 - "
# "final asset list.xlsx"
# )
# with pd.ExcelWriter(filename) as writer:
# full_sal.to_excel(writer, sheet_name="Standardised Asset List", index=False)
# # Store the three sections
# reduced_sal_final[0:25000].to_excel(writer, sheet_name="Batch 1 - was correct", index=False)
# reduced_sal_final[25000:].to_excel(writer, sheet_name="Batch 2 - was correct", index=False)
# sal_to_add.to_excel(writer, sheet_name="Batch 3 - needs adding", index=False)
# We now prepare the process of getting the associated plans.
# We have the properties we need to delete. We can get their associated plans for all scenario IDs
# Scenario IDs whose plans are in scope for the clean-up described above.
# NOTE(review): no uncommented statement in this script reads this list; the
# commented-out code below works from a single SCENARIO_ID_WITH_PLANS_TO_DELETE.
scenario_ids = [908, 909, 910]
import pandas as pd
from sqlalchemy.orm import Session
from backend.app.db.models.portfolio import PropertyModel
from backend.app.db.connection import db_session, db_read_session
from sqlalchemy import select, func
from sqlalchemy.orm import Session
from backend.app.db.models.recommendations import Plan
# UPRNs of the properties that have to be removed, as a plain Python list.
# Missing values are dropped and repeats collapsed (first-seen order is kept)
# so that NaN and duplicate entries never reach the UPRN filter used to look
# up property IDs.
uprns_to_be_deleted = to_delete["epc_os_uprn"].dropna().unique().tolist()
# PORTFOLIO_ID = 435
# SCENARIO_ID_WITH_PLANS_TO_DELETE = 910
# Get the property IDs for these UPRNs
# def get_property_ids_for_uprns(session: Session, uprns: list[int], portfolio_id) -> list[int]:
# return [
# property_id
# for (property_id,) in
# session.query(PropertyModel.id)
# .filter(
# PropertyModel.uprn.in_(uprns),
# PropertyModel.portfolio_id == portfolio_id
# )
# .all()
# ]
#
#
# with db_read_session() as session:
# property_ids_to_delete = get_property_ids_for_uprns(
# session, uprns_to_be_deleted, portfolio_id=PORTFOLIO_ID
# )
#
#
# def count_plans_for_scenario(session: Session, scenario_id: int, portfolio_id, property_ids) -> int:
# return session.execute(
# select(func.count())
# .select_from(Plan)
# .where(
# Plan.scenario_id == scenario_id,
# Plan.portfolio_id == portfolio_id,
# Plan.property_id.in_(property_ids)
# )
# ).scalar_one()
#
#
# with db_session() as session:
# n_plans = count_plans_for_scenario(
# session,
# scenario_id=SCENARIO_ID_WITH_PLANS_TO_DELETE,
# portfolio_id=PORTFOLIO_ID,
# property_ids=property_ids_to_delete
# )
#
#
# def get_plan_ids_for_scenario(
# session: Session, scenario_id: int, portfolio_id, property_ids
# ) -> list[int]:
# result = session.execute(
# select(Plan.id, Plan.property_id)
# .where(
# Plan.scenario_id == scenario_id,
# Plan.portfolio_id == portfolio_id,
# Plan.property_id.in_(property_ids)
# )
# )
# return [{"plan_id": row.id, "property_id": row.property_id} for row in result]
#
#
# with db_session() as session:
# plan_ids_to_property = get_plan_ids_for_scenario(
# session,
# scenario_id=SCENARIO_ID_WITH_PLANS_TO_DELETE,
# portfolio_id=PORTFOLIO_ID,
# property_ids=property_ids_to_delete
# )
#
# df = pd.DataFrame(plan_ids_to_property)
# df[df["property_id"].duplicated()].shape
#
# plan_ids = [row["plan_id"] for row in plan_ids_to_property]
#
#
# def chunked(iterable, size):
# for i in range(0, len(iterable), size):
# yield iterable[i:i + size]
#
#
# from sqlalchemy import text
# from sqlalchemy.orm import Session
#
#
# def delete_plan_batch(session: Session, plan_ids: list[int]):
# if not plan_ids:
# return
#
# session.execute(text("SET LOCAL lock_timeout = '5s'"))
#
# params = {"plan_ids": plan_ids}
#
# # ----------------------------
# # recommendation_materials
# # ----------------------------
# session.execute(
# text("""
# DELETE FROM recommendation_materials rm
# USING plan_recommendations pr
# WHERE rm.recommendation_id = pr.recommendation_id
# AND pr.plan_id = ANY(:plan_ids)
# """),
# params,
# )
#
# # ----------------------------
# # plan_recommendations
# # ----------------------------
# session.execute(
# text("""
# DELETE FROM plan_recommendations
# WHERE plan_id = ANY(:plan_ids)
# """),
# params,
# )
#
# # ----------------------------
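# # recommendations (only those used by these plans)
# # NOTE(review): by this point the plan_recommendations rows for these plans
# # have already been deleted above, so the subquery below matches nothing and
# # no recommendation is removed. Collect the recommendation ids (or run this
# # delete) before deleting from plan_recommendations.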
# # ----------------------------
# session.execute(
# text("""
# DELETE FROM recommendation r
# WHERE r.id IN (
# SELECT DISTINCT recommendation_id
# FROM plan_recommendations
# WHERE plan_id = ANY(:plan_ids)
# )
# """),
# params,
# )
#
# # ----------------------------
# # plans LAST
# # ----------------------------
# session.execute(
# text("""
# DELETE FROM plan
# WHERE id = ANY(:plan_ids)
# """),
# params,
# )
#
#
# batch_size = 25
# total = (len(plan_ids) + batch_size - 1) // batch_size
#
# for i, batch in enumerate(chunked(plan_ids, batch_size), start=1):
# print(f"Deleting plan batch {i}/{total} ({len(batch)} plans)")
#
# with db_session() as session:
# delete_plan_batch(session, batch)
#
# print(f"Batch {i} committed")
#
# # Now, we delete the associated properties in batch and associated objects. It should
# # largely be property, property details
# property_ids_to_delete
#
# from sqlalchemy import text
# from sqlalchemy.orm import Session
#
#
# def move_properties_between_portfolios(
# session: Session,
# property_ids: list[int],
# from_portfolio_id: int,
# to_portfolio_id: int,
# ):
# if not property_ids:
# return 0
#
# result = session.execute(
# text("""
# UPDATE property
# SET portfolio_id = :to_portfolio_id
# WHERE portfolio_id = :from_portfolio_id
# AND id = ANY(:property_ids)
# """),
# {
# "property_ids": property_ids,
# "from_portfolio_id": from_portfolio_id,
# "to_portfolio_id": to_portfolio_id,
# },
# )
#
# return result.rowcount
#
#
# # Moved?
# # 573476, 586011
#
# property_ids_to_delete2 = [x for x in property_ids_to_delete if x not in [573476, 586011]]
#
# with db_session() as session:
# n_moved = move_properties_between_portfolios(
# session,
# property_ids=property_ids_to_delete2,
# from_portfolio_id=PORTFOLIO_ID,
# to_portfolio_id=32, # Archive portfolio
# )

View file

@ -87,8 +87,6 @@ resource "aws_db_instance" "default" {
apply_immediately = true
# Set up storage type to gp3 for better performance
storage_type = "gp3"
# Instance size
instance_class = "db.t4g.medium"
}
# Set up the bucket that receives the csv uploads of epc to be retrofit

View file

@ -14,14 +14,16 @@ from collections import defaultdict
# PORTFOLIO_ID = 206
# SCENARIOS = [389]
PORTFOLIO_ID = 434 # Peabody
PORTFOLIO_ID = 435 # Peabody
SCENARIOS = [
904,
905
908,
909,
# 910,
]
scenario_names = {
904: "EPC C - no solid floor, ashp 3.0",
905: "EPC B - no solid floor, ashp 3.0",
908: "EPC C - no solid floor, ashp 3.0",
909: "EPC C - no solid floor, no EWI or IWI, ashp 3.0",
# 910: "EPC B - no solid floor, no EWI, ashp 3.0"
}
@ -231,7 +233,7 @@ for scenario_id in SCENARIOS:
# Create excel to store to
filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
f"Project/Final SAL/{scenario_names[scenario_id]} - corrected.xlsx")
f"Project/Final SAL/{scenario_names[scenario_id]} - 20250113 final.xlsx")
with pd.ExcelWriter(filename) as writer:
df.to_excel(writer, sheet_name="properties", index=False)