mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
536 lines
20 KiB
Python
536 lines
20 KiB
Python
import os
|
|
import json
|
|
import pandas as pd
|
|
from pprint import pprint
|
|
import msgpack
|
|
from utils.s3 import read_from_s3
|
|
from asset_list.AssetList import AssetList
|
|
from asset_list.mappings.property_type import PROPERTY_MAPPING
|
|
from asset_list.mappings.built_form import BUILT_FORM_MAPPINGS
|
|
from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS
|
|
from asset_list.mappings.heating_systems import HEATING_MAPPINGS
|
|
from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS
|
|
from asset_list.mappings.roof import ROOF_CONSTRUCTION_MAPPINGS
|
|
from asset_list.utils import get_data
|
|
|
|
from dotenv import load_dotenv
|
|
from backend.SearchEpc import SearchEpc
|
|
|
|
# Load environment variables from the backend's .env file. The path is relative, so it is
# presumably resolved against the process's current working directory - run from the repo root.
load_dotenv(dotenv_path="backend/.env")
# Token forwarded to get_data() as ``epc_auth_token``; os.getenv returns None when it is not set.
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
|
|
|
|
|
def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"):
    """
    Derive an ``address1_extracted`` column from a full-address column.

    The column is written onto ``asset_list`` itself (the frame is modified in place) and the same
    frame is returned.

    :param asset_list: DataFrame containing the asset list
    :param full_address_col: name of the column holding the full address string
    :param postcode_col: name of the postcode column; only read by ``house_number_extraction``
    :param method: one of ``"first_two_words"``, ``"first_word"`` or ``"house_number_extraction"``
    :return: ``asset_list`` with the ``address1_extracted`` column added
    :raises ValueError: if ``method`` is not one of the recognised values
    """
    if method == "first_two_words":
        # split() without a separator splits on runs of whitespace and ignores leading/trailing
        # whitespace, so "12  High Street" yields "12 High" rather than "12 " (empty second word)
        words = asset_list[full_address_col].str.split()
        asset_list["address1_extracted"] = words.str[:2].str.join(" ")
        return asset_list

    if method == "first_word":
        # Same whitespace handling as above: " 7 Mill Lane" yields "7", not an empty string
        asset_list["address1_extracted"] = asset_list[full_address_col].str.split().str[0]
        return asset_list

    if method == "house_number_extraction":
        # Delegate row by row to the EPC search helper
        asset_list["address1_extracted"] = asset_list.apply(
            lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
            axis=1
        )
        return asset_list

    raise ValueError(f"Method {method} not recognized")
|
|
|
|
|
|
def app():
    """
    Standardise a landlord asset list, enrich it with EPC data and export the results.

    The function takes no arguments: the customer configuration is hard-coded at the top of the body.
    In order, it:

    1. Loads the asset list into an ``AssetList`` and standardises it
    2. Flags properties covered by outcomes files, survey masters and Ecosurv
    3. Retrieves EPC data in chunks via ``get_data``, caching each chunk as a CSV in a ``Chunks``
       folder next to the asset list so that an interrupted run can be resumed
    4. Merges the EPC data onto the asset list and identifies work types
    5. Loads contact details and prepares a CRM (Hubspot) export
    6. Writes a "<asset list name> - Standardised.xlsx" workbook and a "Hubspot Export.csv"

    Data request contents:
    Date of last EPC
    Reason for EPC
    SAP score on register
    Property Type
    Property Area
    Property Age
    Any Dimensions (HLP,PW,RH)
    Property Wall Construction
    Heating Type
    Secondary Heating
    Loft Insulation Depth

    Additional if possible:
    Heat loss calculations
    EPC recommendations
    Property UPRN
    """

    # ------------------------------------------------------------------------------------------
    # Customer configuration
    # NOTE: the presets below are plain assignments executed top to bottom, so only the LAST
    # uncommented preset (currently MHS) takes effect. Earlier presets are kept for reference.
    # ------------------------------------------------------------------------------------------

    # Thurrock
    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Thurrock"
    data_filename = "THURROCK COUNCIL.xlsx"
    sheet_name = "Assets"
    postcode_column = 'Postcode'
    fulladdress_column = "Full Address"
    address1_column = None
    address1_method = "house_number_extraction"
    address_cols_to_concat = []
    missing_postcodes_method = None
    landlord_year_built = "Construction Date"
    landlord_os_uprn = None
    landlord_property_type = "Property Type"
    landlord_built_form = "Property Subtype"
    landlord_wall_construction = None
    landlord_roof_construction = None
    landlord_heating_system = "Main Heating Type"
    landlord_existing_pv = None
    landlord_property_id = "Property Reference"
    landlord_sap = None
    outcomes_filename = []
    outcomes_sheetname = []
    outcomes_postcode = []
    outcomes_houseno = []
    outcomes_id = []
    outcomes_address = []
    master_filepaths = []
    master_to_asset_list_filepath = None
    phase = False
    ecosurv_landlords = None

    # Medway
    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Medway"
    data_filename = "MEDWAY Asset List.xlsx"
    sheet_name = "Asset list"
    postcode_column = 'Postcode'
    fulladdress_column = None
    address1_column = "House Number"
    address1_method = None
    address_cols_to_concat = ["House Number", "Street 1"]
    missing_postcodes_method = None
    landlord_year_built = "Year Built"
    landlord_os_uprn = None
    landlord_property_type = "Property Type - Academy"
    landlord_built_form = "Property Type - Academy"
    landlord_wall_construction = None
    landlord_roof_construction = None
    landlord_heating_system = None
    landlord_existing_pv = None
    landlord_property_id = "Row ID"
    landlord_sap = None
    outcomes_filename = []
    outcomes_sheetname = []
    outcomes_postcode = []
    outcomes_houseno = []
    outcomes_id = []
    outcomes_address = []
    master_filepaths = []
    master_to_asset_list_filepath = None
    phase = False
    ecosurv_landlords = None

    # MHS
    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS"
    data_filename = "MHS HOMES (Full Asset List) - for programme build.xlsx"
    sheet_name = "Sheet1"
    postcode_column = 'Postcode'
    fulladdress_column = "FullAddress"
    address1_column = None
    address1_method = "house_number_extraction"
    address_cols_to_concat = []
    missing_postcodes_method = None
    landlord_year_built = "BuiltInYear"
    landlord_os_uprn = None
    landlord_property_type = "AssetType"
    landlord_built_form = "PropertyType"
    landlord_wall_construction = None
    landlord_roof_construction = None
    landlord_heating_system = None
    landlord_existing_pv = None
    landlord_property_id = "UPRN"
    landlord_sap = None
    outcomes_filename = []
    outcomes_sheetname = []
    outcomes_postcode = []
    outcomes_houseno = []
    outcomes_id = []
    outcomes_address = []
    master_filepaths = []
    master_to_asset_list_filepath = None
    phase = False
    ecosurv_landlords = None

    # Southern Midlands
    # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/Midlands Properties - Apr 2025"
    # data_filename = "Southern Housing Midlands Property List - combined.xlsx"
    # sheet_name = "Sheet 1"
    # postcode_column = 'Post Code'
    # fulladdress_column = "Address"
    # address1_column = None
    # address1_method = "house_number_extraction"
    # address_cols_to_concat = []
    # missing_postcodes_method = None
    # landlord_year_built = "Age_1"
    # landlord_os_uprn = None
    # landlord_property_type = "Prop_Type"
    # landlord_built_form = "Prop_Type"
    # landlord_wall_construction = "Walls_P"
    # landlord_heating_system = "Heating System"
    # landlord_existing_pv = None
    # landlord_property_id = "AssetID"
    # outcomes_filename = None
    # outcomes_sheetname = None
    # outcomes_postcode = None
    # outcomes_houseno = None
    # outcomes_id = None
    # outcomes_address = None
    # master_filepaths = []
    # master_to_asset_list_filepath = None

    # Maps addresses to uprn in problematic cases
    manual_uprn_map = {}

    asset_list = AssetList(
        local_filepath=os.path.join(data_folder, data_filename),
        header=0,
        sheet_name=sheet_name,
        address1_colname=address1_column,
        postcode_colname=postcode_column,
        landlord_property_id=landlord_property_id,
        full_address_colname=fulladdress_column,
        full_address_cols_to_concat=address_cols_to_concat,
        missing_postcodes_method=missing_postcodes_method,
        address1_extraction_method=address1_method,
        landlord_year_built=landlord_year_built,
        landlord_uprn=landlord_os_uprn,
        landlord_property_type=landlord_property_type,
        landlord_built_form=landlord_built_form,
        landlord_wall_construction=landlord_wall_construction,
        landlord_roof_construction=landlord_roof_construction,
        landlord_heating_system=landlord_heating_system,
        landlord_existing_pv=landlord_existing_pv,
        landlord_sap=landlord_sap,
        phase=phase
    )
    asset_list.init_standardise()

    def _new_mappings(landlord_column, known_mappings):
        # Entries of the mapping produced for landlord_column whose keys are not yet in the
        # stored mapping; empty when the landlord did not supply that column
        if not landlord_column:
            return {}
        return {
            k: v for k, v in asset_list.variable_mappings[landlord_column].items()
            if k not in known_mappings
        }

    # We produce the new maps, which can be saved for future usage. They are not consumed further
    # down; they are kept so they can be inspected (e.g. in a debugger) and copied into the
    # mapping modules.
    new_property_type_map = _new_mappings(asset_list.landlord_property_type, PROPERTY_MAPPING)
    new_built_form_map = _new_mappings(asset_list.landlord_built_form, BUILT_FORM_MAPPINGS)
    new_wall_map = _new_mappings(asset_list.landlord_wall_construction, WALL_CONSTRUCTION_MAPPINGS)
    new_heating_map = _new_mappings(asset_list.landlord_heating_system, HEATING_MAPPINGS)
    new_existing_pv_map = _new_mappings(asset_list.landlord_existing_pv, EXISTING_PV_MAPPINGS)
    new_roof_construction_map = _new_mappings(asset_list.landlord_roof_construction, ROOF_CONSTRUCTION_MAPPINGS)

    asset_list.apply_standardiation()

    # We now flag properties that have been treated under existing programmes
    asset_list.flag_outcomes(
        outcomes_filepaths=outcomes_filename,
        outcomes_sheetname=outcomes_sheetname,
        outcomes_address=outcomes_address,
        outcomes_postcode=outcomes_postcode,
        outcomes_houseno=outcomes_houseno,
        outcomes_id=outcomes_id
    )

    asset_list.flag_survey_master(
        master_filepaths=master_filepaths,
        master_to_asset_list_filepath=master_to_asset_list_filepath
    )

    asset_list.flag_ecosurv(ecosurv_landlords)

    ### We retrieve the EPC data

    property_id_col = asset_list.DOMNA_PROPERTY_ID

    # We chunk up this data into 5000 rows at a time and cache every chunk on disk
    epc_api_only = False
    force_retrieve_data = False  # Set to True to re-fetch chunks even when they are cached
    skip = None  # Manual override: chunks starting at an index <= skip are not fetched
    chunk_size = 5000
    filename = "Chunk {i}.csv"
    download_folder = os.path.join(data_folder, "Chunks")
    # Create the chunks directory if it is not there yet
    os.makedirs(download_folder, exist_ok=True)

    chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size))
    # Kept as a list in chunk order so the chunks are always concatenated in the same order
    downloaded_files = [filename.format(i=i) for i in chunk_indexes]

    # Chunks that already have a file on disk are not fetched again unless force_retrieve_data is
    # set. The check is made per chunk: deriving a single "skip up to the highest existing chunk"
    # index would silently skip any missing chunk below it and break the read-back further down.
    folder_contents = set(os.listdir(download_folder))

    def _fetch_epc_data(df):
        # Single place for the get_data arguments shared by the first attempt and the retry
        return get_data(
            df=df,
            row_id_name=asset_list.DOMNA_PROPERTY_ID,
            uprn_column=AssetList.STANDARD_UPRN,
            fulladdress_column=AssetList.STANDARD_FULL_ADDRESS,
            address1_column=AssetList.STANDARD_ADDRESS_1,
            postcode_column=AssetList.STANDARD_POSTCODE,
            property_type_column=AssetList.STANDARD_PROPERTY_TYPE,
            built_form_column=AssetList.STANDARD_BUILT_FORM,
            manual_uprn_map=manual_uprn_map,
            epc_api_only=epc_api_only,
            epc_auth_token=EPC_AUTH_TOKEN
        )

    for i in chunk_indexes:
        already_downloaded = filename.format(i=i) in folder_contents
        manually_skipped = skip is not None and i <= skip
        if not force_retrieve_data and (already_downloaded or manually_skipped):
            print(f"Skipping chunk {i} to {i + chunk_size}")
            continue

        print(f"Processing chunk {i} to {i + chunk_size}")
        chunk = asset_list.standardised_asset_list[i:i + chunk_size]
        epc_data_chunk, errors_chunk, no_epc_chunk = _fetch_epc_data(chunk)

        # We now retry any failed properties once and append what we get to the main data
        chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)]
        epc_data_failed, _, _ = _fetch_epc_data(chunk_failed)
        epc_data_chunk.extend(epc_data_failed)

        # Store the chunk locally as a csv
        pd.DataFrame(epc_data_chunk).to_csv(os.path.join(download_folder, filename.format(i=i)), index=False)
        # Store the errors and no-data locally
        with open(os.path.join(download_folder, f"Chunk {i} errors.json"), "w") as f:
            json.dump(errors_chunk, f)

        # NOTE(review): the content is JSON although the file is named .csv; the name is left
        # unchanged in case something else reads it
        with open(os.path.join(download_folder, f"Chunk {i} nodata.csv"), "w") as f:
            json.dump(no_epc_chunk, f)

    # We read in and concatenate the created chunks
    epc_data = []
    for file in downloaded_files:
        csv_data = pd.read_csv(os.path.join(download_folder, file))
        # We need to convert the recommendations back to a list
        # NOTE(security): eval executes whatever text is in the CSV. The values originate from
        # external EPC sources - consider ast.literal_eval once the stored format is confirmed
        # to be plain Python literals.
        csv_data["recommendations"] = csv_data["recommendations"].apply(eval)
        # We don't have this if we didn't run the pulling from find my epc
        if "find_my_epc_data" in csv_data.columns:
            csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval)
        epc_data.append(csv_data)

    epc_df = pd.concat(epc_data)
    if "estimated" not in epc_df.columns:
        epc_df["estimated"] = False

    epc_df["estimated"] = epc_df["estimated"].fillna(False)

    # We expand out the recommendations. Only the floor insulation recommendations are used
    # downstream, so we flag just those instead of building one column per recommendation.
    floor_columns = [
        "Floor insulation (solid floor)",
        "Floor insulation", "Floor insulation (suspended floor)"
    ]
    transformed_data = []
    for property_id, recommendations in zip(epc_df[property_id_col], epc_df["recommendations"]):
        recommendation_texts = {rec["improvement-summary-text"] for rec in recommendations}
        row_data = {property_id_col: property_id}
        row_data.update({col: col in recommendation_texts for col in floor_columns})
        transformed_data.append(row_data)

    transformed_df = pd.DataFrame(transformed_data, columns=[property_id_col] + floor_columns)
    transformed_df["epc_has_floor_recommendation"] = transformed_df[floor_columns].any(axis=1)

    # Get the find my epc data
    if "find_my_epc_data" not in epc_df.columns:
        epc_df["find_my_epc_data"] = None

    find_my_epc_rows = []
    for property_id, find_my_epc in zip(epc_df[property_id_col], epc_df["find_my_epc_data"]):
        row_data = {property_id_col: property_id}
        # Rows from chunks without this column come through as NaN after the concat. NaN is
        # truthy, so we check for a dict explicitly rather than relying on truthiness.
        if isinstance(find_my_epc, dict):
            row_data.update(find_my_epc)
        find_my_epc_rows.append(row_data)

    find_my_epc_data = pd.DataFrame(find_my_epc_rows)

    find_my_epc_data = find_my_epc_data.merge(
        transformed_df[[property_id_col, "epc_has_floor_recommendation"]],
        how="left", on=property_id_col
    )

    # We check if we get the solar pv column:
    if "Solar photovoltaics" not in find_my_epc_data.columns:
        find_my_epc_data["Solar photovoltaics"] = False

    # Retrieve just the data we need
    epc_df = epc_df[
        [property_id_col] + list(asset_list.EPC_API_DATA_NAMES.keys())
    ].rename(
        columns=asset_list.EPC_API_DATA_NAMES
    )

    # Look for columns not in the find my EPC data, which will have happened if we didn't
    # retrieve it in the first place
    for c in asset_list.FIND_EPC_DATA_NAMES:
        if c not in find_my_epc_data.columns:
            find_my_epc_data[c] = None

    epc_df = epc_df.merge(
        find_my_epc_data[
            [property_id_col, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys())
        ]
        .rename(columns=asset_list.FIND_EPC_DATA_NAMES),
        how="left",
        on=property_id_col
    )

    asset_list.merge_data(epc_df)

    asset_list.extract_attributes()

    # The object is stored with a .bson name but is decoded with msgpack
    cleaned = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev"
    )
    cleaned = msgpack.unpackb(cleaned, raw=False)

    asset_list.identify_worktypes(cleaned)

    pprint(asset_list.work_type_figures)

    asset_list.flat_analysis()

    # NOTE(review): the workbook name and column names below are hard-coded and are not part of
    # the per-customer presets above - confirm they match the selected customer
    asset_list.load_contact_details(
        local_filepath=os.path.join(data_folder, "Full property list wth D&V report V look up 12.2.25.xlsx"),
        sheet_name="Report 1",
        landlord_property_id=asset_list.landlord_property_id,
        phone_number_column='Property Current Tel. Number',
        fullname_column='Proeprty Current Occupant',
        firstname_column=None,
        lastname_column=None,
        email_column=None,  # TODO - we need this
    )

    # Convert to a format suitable for CRM
    # TODO: TEMP
    assigned_surveyors = pd.DataFrame(
        [
            {
                asset_list.landlord_property_id: "02610001",
                "week_commencing": "10/10/2025",
                "surveyor_name": "Khalim Conn-Kowlessar",
                "surveyor_email": "khalim@domna.homes",
            }
        ]
    )

    # TODO: Sort the output by postcode

    # NOTE(review): this domain is hard-coded and does not obviously belong to the customer
    # selected above - confirm before exporting
    company_domain = "ealing.gov.uk"
    crm_pipeline_name = "Survey Management"
    first_dealstage = "READY TO BEGIN SCHEDULING"
    # TODO - temp, upload to either SharePoint or AWS

    asset_list.prepare_for_crm(
        assigned_surveyors=assigned_surveyors,
        company_domain=company_domain,
        crm_pipeline_name=crm_pipeline_name,
        first_dealstage=first_dealstage
    )
    hubspot_data = asset_list.hubspot_data

    # Store as an excel, named after the asset list file with its extension removed
    output_filepath = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx"
    # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data.
    # The remaining tabs are only written when there is something to report.
    optional_sheets = [
        ("Outcomes", asset_list.outcomes_for_output),
        ("Unmatched Submissions", asset_list.unmatched_submissions),
        ("Unmatched Outcomes", asset_list.outcomes_no_match),
        ("Unmatched Ecosurv", asset_list.ecosurv_no_match),
    ]

    with pd.ExcelWriter(output_filepath) as writer:
        asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False)
        asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
        for optional_sheet_name, optional_df in optional_sheets:
            if not optional_df.empty:
                optional_df.to_excel(writer, sheet_name=optional_sheet_name, index=False)

    # Store the Hubspot export as a csv
    hubspot_data.to_csv(os.path.join(data_folder, "Hubspot Export.csv"), index=False)
|