mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
adding matching from submissions sheet to asset list
This commit is contained in:
parent
2d71ad25ef
commit
3cfe938e27
16 changed files with 509 additions and 45 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="AssetList" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyNamespacePackagesService">
|
||||
|
|
|
|||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -3,7 +3,7 @@
|
|||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="AssetList" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -4,6 +4,8 @@ import re
|
|||
import tiktoken
|
||||
from pprint import pprint
|
||||
from datetime import datetime
|
||||
|
||||
from docutils.utils.math.tex2mathml_extern import blahtexml
|
||||
from openai import OpenAI
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
|
@ -663,7 +665,10 @@ class AssetList:
|
|||
non_intrusive_columns.append(self.NON_INTRUSIVES_ELIGIBILITY_COLUMN)
|
||||
|
||||
if self.old_format_non_intrusives_present:
|
||||
non_intrusive_columns = self.OLD_FORMAT_NON_INTRUSIVE_COLNAMES
|
||||
# We check if we have the ECO Eligibility column, which we might not have
|
||||
non_intrusive_columns = [
|
||||
c for c in self.OLD_FORMAT_NON_INTRUSIVE_COLNAMES if c in self.standardised_asset_list.columns
|
||||
]
|
||||
|
||||
self.keep_variables += non_intrusive_columns
|
||||
|
||||
|
|
@ -731,7 +736,7 @@ class AssetList:
|
|||
'PIMSS EMPTY'
|
||||
]
|
||||
|
||||
if pd.isnull(date_str) or date_str in known_errors:
|
||||
if pd.isnull(date_str) or date_str in known_errors or (date_str == 0):
|
||||
return None
|
||||
|
||||
if isinstance(date_str, str):
|
||||
|
|
@ -752,6 +757,10 @@ class AssetList:
|
|||
if isinstance(date_str, datetime):
|
||||
return date_str.year
|
||||
|
||||
if isinstance(date_str, float):
|
||||
if str(int(date_str)).isdigit() & (len(str(int(date_str))) == 4):
|
||||
return int(date_str)
|
||||
|
||||
# Check if date_str is a year itself
|
||||
if str(date_str).isdigit() & (len(str(date_str)) == 4):
|
||||
return int(date_str)
|
||||
|
|
@ -1325,7 +1334,7 @@ class AssetList:
|
|||
)
|
||||
self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] = (
|
||||
self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin(
|
||||
["electric storage heaters", "room heaters", "electric radiators"]
|
||||
["electric storage heaters", "room heaters", "electric radiators", "no heating"]
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -2099,6 +2108,9 @@ class AssetList:
|
|||
nomatch = []
|
||||
for _, x in tqdm(self.outcomes.iterrows(), total=len(self.outcomes)):
|
||||
|
||||
if pd.isnull(x[outcomes_address]):
|
||||
continue
|
||||
|
||||
# Check if we have an id
|
||||
oid = x[outcomes_id] if outcomes_id is not None else None
|
||||
|
||||
|
|
@ -2120,6 +2132,8 @@ class AssetList:
|
|||
|
||||
address_clean = x[outcomes_address].lower().replace(",", "").replace(" ", " ")
|
||||
|
||||
self.outcomes["Outcome"] = self.outcomes["Outcome"].str.lower()
|
||||
|
||||
matched = self.standardised_asset_list[
|
||||
(self.standardised_asset_list[
|
||||
self.STANDARD_FULL_ADDRESS
|
||||
|
|
@ -2140,7 +2154,9 @@ class AssetList:
|
|||
].copy()
|
||||
if not matched.empty:
|
||||
matched["houseno"] = matched.apply(
|
||||
lambda x: SearchEpc.get_house_number(x[self.STANDARD_ADDRESS_1], x[self.STANDARD_POSTCODE]),
|
||||
lambda x: SearchEpc.get_house_number(
|
||||
str(x[self.STANDARD_ADDRESS_1]), str(x[self.STANDARD_POSTCODE])
|
||||
),
|
||||
axis=1
|
||||
)
|
||||
|
||||
|
|
@ -2155,8 +2171,6 @@ class AssetList:
|
|||
}
|
||||
)
|
||||
continue
|
||||
elif matched.shape[0] > 1:
|
||||
raise NotImplementedError("Check me")
|
||||
elif not matched.empty:
|
||||
# Use Levenshtein distance to match
|
||||
matched["address"] = matched[self.STANDARD_ADDRESS_1] + " " + matched[self.STANDARD_POSTCODE]
|
||||
|
|
@ -2254,19 +2268,123 @@ class AssetList:
|
|||
"SUBMISSION DATE" if "SUBMISSION DATE" in master_data.columns else "SUBMISSION DATE TO INSTALLERS"
|
||||
)
|
||||
|
||||
# We just need to check if any were cancelled
|
||||
master_to_append = master_data[
|
||||
["UPRN", install_col, submission_col]
|
||||
].rename(
|
||||
if "UPRN" in master_data.columns:
|
||||
# We just need to check if any were cancelled
|
||||
master_to_append = master_data[
|
||||
["UPRN", install_col, submission_col]
|
||||
].rename(
|
||||
columns={
|
||||
"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID,
|
||||
install_col: "survey_status",
|
||||
submission_col: "submission_date"
|
||||
}
|
||||
)
|
||||
master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel")
|
||||
|
||||
master_surveyed.append(master_to_append)
|
||||
continue
|
||||
|
||||
master_data["row_id"] = master_data.index
|
||||
|
||||
self.standardised_asset_list["house_no"] = self.standardised_asset_list.apply(
|
||||
lambda x: SearchEpc.get_house_number(
|
||||
str(x[self.STANDARD_ADDRESS_1]), str(x[self.STANDARD_POSTCODE])
|
||||
),
|
||||
axis=1
|
||||
)
|
||||
|
||||
# Otherwise, we need to match algorithmically
|
||||
logger.info("Matching master data to asset list")
|
||||
matched = []
|
||||
unmatched = []
|
||||
for _, row in tqdm(master_data.iterrows(), total=len(master_data)):
|
||||
if pd.isnull(row["POSTCODE"]):
|
||||
continue
|
||||
postcode_no_space = row["POSTCODE"].strip().replace(" ", "").lower()
|
||||
|
||||
df = self.standardised_asset_list[
|
||||
(
|
||||
self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip().str.lower().str.replace(" ",
|
||||
"")
|
||||
== postcode_no_space
|
||||
)
|
||||
]
|
||||
|
||||
house_no = row["NO"]
|
||||
|
||||
if house_no in df["house_no"].values:
|
||||
df = df[df["house_no"] == house_no]
|
||||
if df.shape[0] != 1:
|
||||
# Levenshtein distance
|
||||
|
||||
if any(df[self.STANDARD_FULL_ADDRESS].str.contains(row["Street / Block Name"])):
|
||||
df = df[
|
||||
df[self.STANDARD_FULL_ADDRESS].str.contains(row["Street / Block Name"])
|
||||
]
|
||||
else:
|
||||
# Levenshtein distance
|
||||
df = df[
|
||||
df[self.STANDARD_FULL_ADDRESS].str.lower().apply(
|
||||
lambda x: process.extractOne(
|
||||
" ".join([row["NO"], row["Street / Block Name"], row["TOWN"]]).lower(),
|
||||
x
|
||||
)[1]
|
||||
) > 90
|
||||
]
|
||||
|
||||
if df.shape[0] == 0:
|
||||
unmatched.append(row["row_id"])
|
||||
continue
|
||||
|
||||
if any(df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains(
|
||||
" ".join([row["NO"], row["Street / Block Name"]]).lower()
|
||||
)):
|
||||
df = df[
|
||||
df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains(
|
||||
" ".join([row["NO"], row["Street / Block Name"]]).lower()
|
||||
)
|
||||
]
|
||||
|
||||
if any(
|
||||
df[self.STANDARD_PROPERTY_TYPE].str.contains(
|
||||
row["PROPERTY TYPE As per table emailed"].split(" ")[-1].lower()
|
||||
)
|
||||
):
|
||||
# We ignore "block of flats" entries
|
||||
df = df[
|
||||
df[self.STANDARD_PROPERTY_TYPE].str.contains(
|
||||
row["PROPERTY TYPE As per table emailed"].split(" ")[-1].lower()
|
||||
) & (df[self.STANDARD_PROPERTY_TYPE] != "block of flats")
|
||||
]
|
||||
|
||||
if df.shape[0] != 1:
|
||||
# We have multiple matches
|
||||
raise NotImplementedError("FIX ME")
|
||||
matched.append(
|
||||
{
|
||||
"row_id": row["row_id"],
|
||||
self.STANDARD_LANDLORD_PROPERTY_ID: df[self.STANDARD_LANDLORD_PROPERTY_ID].values[0],
|
||||
}
|
||||
)
|
||||
|
||||
self.standardised_asset_list = self.standardised_asset_list.drop(columns="house_no")
|
||||
|
||||
# We match the "UPRN", which is the landlord's ID, onto the master sheet
|
||||
matched = pd.DataFrame(matched)
|
||||
master_to_append = master_data[["row_id", install_col, submission_col]].merge(
|
||||
matched, how="left", on="row_id"
|
||||
).rename(
|
||||
columns={
|
||||
"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID,
|
||||
install_col: "survey_status",
|
||||
submission_col: "submission_date"
|
||||
}
|
||||
)
|
||||
master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel")
|
||||
|
||||
master_surveyed.append(master_to_append)
|
||||
unmatched_df = master_data[
|
||||
master_data["row_id"].isin(unmatched)
|
||||
]
|
||||
submissions_unmatched.append(unmatched_df)
|
||||
|
||||
master_surveyed = pd.concat(master_surveyed)
|
||||
master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])]
|
||||
|
|
|
|||
|
|
@ -89,6 +89,42 @@ def app():
|
|||
# - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
|
||||
# - Or the insulation required is loft/cavity (floors should be solid)
|
||||
|
||||
# Bromford
|
||||
data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme "
|
||||
"Rebuild/Prepared data/")
|
||||
data_filename = "asset_list.xlsx"
|
||||
sheet_name = "Sheet1"
|
||||
postcode_column = 'PostCode'
|
||||
fulladdress_column = "FullAddress"
|
||||
address1_column = None
|
||||
address1_method = "house_number_extraction"
|
||||
address_cols_to_concat = []
|
||||
missing_postcodes_method = None
|
||||
landlord_year_built = "ConYear"
|
||||
landlord_os_uprn = None
|
||||
landlord_property_type = "AssetTypeDesc"
|
||||
landlord_built_form = "PropTypeDesc"
|
||||
landlord_wall_construction = "Construction type"
|
||||
landlord_roof_construction = None
|
||||
landlord_heating_system = "Heating Type"
|
||||
landlord_existing_pv = None
|
||||
landlord_property_id = "Asset"
|
||||
landlord_sap = None
|
||||
outcomes_filename = "outcomes.xlsx"
|
||||
outcomes_sheetname = "Sheet1"
|
||||
outcomes_postcode = "Postcode"
|
||||
outcomes_houseno = "No"
|
||||
outcomes_id = None
|
||||
outcomes_address = "Address"
|
||||
master_filepaths = [
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared data/ECO "
|
||||
"3 submissions.csv",
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared data/ECO "
|
||||
"4 submissions.csv",
|
||||
]
|
||||
master_to_asset_list_filepath = None
|
||||
phase = False
|
||||
|
||||
# Torus
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Torus/Phase 1"
|
||||
data_filename = "Torus Property Asset List - Phase 1.xlsx"
|
||||
|
|
|
|||
|
|
@ -107,5 +107,42 @@ BUILT_FORM_MAPPINGS = {
|
|||
'Semi-detached': 'semi-detached',
|
||||
'Detached': 'detached',
|
||||
'Flat / maisonette': 'unknown',
|
||||
'2014 onwards': 'unknown'
|
||||
'2014 onwards': 'unknown',
|
||||
|
||||
'Semi Detached': 'semi-detached',
|
||||
'End Terraced': 'end-terrace',
|
||||
'Basement': 'basement',
|
||||
'No': 'unknown',
|
||||
'Mid Terrace': 'mid-terrace',
|
||||
'Link Detached': 'detached',
|
||||
'Mid Terraced': 'mid-terrace',
|
||||
'Ground Floor': 'ground floor',
|
||||
'End Terrace': 'end-terrace',
|
||||
'Sheltrd Semi Det': 'semi-detached',
|
||||
'Shop': 'unknown',
|
||||
'Fourth Floor': 'mid-floor',
|
||||
'Terraced': 'mid-terrace',
|
||||
'Leasehold Terr': 'mid-terrace',
|
||||
'Room': 'unknown',
|
||||
'Second Floor': 'mid-floor',
|
||||
'Third Floor': 'mid-floor',
|
||||
'Office': 'unknown',
|
||||
'First Floor Over Arch': 'ground floor',
|
||||
'16-25 IND-PPL': 'unknown',
|
||||
'Seventh Floor': 'top-floor',
|
||||
'Sheltered': 'unknown',
|
||||
'Shelt Bung End': 'end-terrace',
|
||||
'Room In Shared Accommodation': 'unknown',
|
||||
'Sheltred Bung Terrace': 'mid-terrace',
|
||||
'Garage In Block': 'unknown',
|
||||
'First Floor': 'ground floor',
|
||||
'First Floor Over Garage': 'ground floor',
|
||||
'Leasehold': 'unknown',
|
||||
'Sheltred Bung': 'unknown',
|
||||
'Garage': 'unknown',
|
||||
'Sixth Floor': 'top-floor',
|
||||
'Sheltered Bung': 'semi-detached',
|
||||
'Guest': 'unknown',
|
||||
'Fifth Floor': 'mid-floor'
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,7 +23,8 @@ STANDARD_HEATING_SYSTEMS = {
|
|||
'gas combi boiler',
|
||||
'unknown',
|
||||
"electric ceiling",
|
||||
"electric underfloor"
|
||||
"electric underfloor",
|
||||
"no heating"
|
||||
}
|
||||
|
||||
HEATING_MAPPINGS = {
|
||||
|
|
@ -87,7 +88,7 @@ HEATING_MAPPINGS = {
|
|||
'Heat pump (air) Electricity': 'air source heat pump',
|
||||
'Room heaters Electricity': 'electric radiators',
|
||||
'Room heaters Oil': 'room heaters',
|
||||
'No heating system ND': 'unknown',
|
||||
'No heating system ND': 'no heating',
|
||||
'Heat pump (wet) Electricity': 'ground source heat pump',
|
||||
'Room heaters Biomass': 'room heaters',
|
||||
'ND Solid fuel': 'unknown',
|
||||
|
|
@ -98,11 +99,11 @@ HEATING_MAPPINGS = {
|
|||
'Storage heating Electricity': 'electric storage heaters',
|
||||
'ND Electricity': 'unknown',
|
||||
'Community heating Community (non-gas)': 'district heating',
|
||||
'No heating system N/A': 'unknown',
|
||||
'No heating system N/A': 'no heating',
|
||||
'Boiler Solid fuel': 'boiler - other fuel',
|
||||
'Community heating Community (mains gas)': 'communal gas boiler',
|
||||
'Boiler Biomass': 'boiler - other fuel',
|
||||
'No heating system Mains gas': 'unknown',
|
||||
'No heating system Mains gas': 'no heating',
|
||||
|
||||
'Storage heaters': 'electric storage heaters',
|
||||
'Air Source': 'air source heat pump',
|
||||
|
|
@ -170,5 +171,36 @@ HEATING_MAPPINGS = {
|
|||
'Heat pump (wet)': 'air source heat pump',
|
||||
'Electric ceiling heating': 'electric ceiling',
|
||||
'Electric under floor heating': 'electric underfloor',
|
||||
'Community heating': 'district heating'
|
||||
'Community heating': 'district heating',
|
||||
|
||||
'Wet - Radiators Air Source Heat Pump': 'air source heat pump',
|
||||
'Wet - Radiators Electric': 'electric boiler',
|
||||
'Storage Heaters': 'high heat retention storage heaters',
|
||||
'Wet - Radiators Oil': 'oil boiler',
|
||||
'Communal Wet - Radiators Gas': 'communal gas boiler',
|
||||
'Electric - Storage/Panel Heaters Electric': 'electric storage heaters',
|
||||
'Gas Central Heating': 'gas combi boiler',
|
||||
'Wet - Radiators Solar': 'other',
|
||||
'Electric - Storage/Panel Heaters LPG': 'electric storage heaters',
|
||||
'No Heating Solid': 'no heating',
|
||||
'Wet - Underfloor Gas': 'gas condensing boiler',
|
||||
'No Heating Electric': 'no heating',
|
||||
'Oil Fired Central Heating': 'oil boiler',
|
||||
'Warm Air Gas': 'other',
|
||||
'Communal Boilers': 'communal gas boiler',
|
||||
'Wet - Radiators Gas': 'gas combi boiler',
|
||||
'Wet - Radiators Solid': 'solid fuel',
|
||||
'Wet - Radiators LPG': 'other',
|
||||
'No Heating Gas': 'no heating',
|
||||
'No Heating': 'no heating',
|
||||
'Panel Heaters': 'electric radiators',
|
||||
'Rointe Electric Heating': 'electric storage heaters',
|
||||
'Underfloor Heating': 'electric underfloor',
|
||||
'Air Source Heating': 'air source heat pump',
|
||||
'Warm Air Electric': 'other',
|
||||
'Communal Wet - Radiators Electric': 'communal gas boiler',
|
||||
'Wet - Underfloor Solar': 'other',
|
||||
'No Heating Required Gas': 'unknown',
|
||||
'Electric - Storage/Panel Heaters Gas': 'electric storage heaters',
|
||||
'Electric - Storage/Panel Heaters Solid': 'electric storage heaters'
|
||||
}
|
||||
|
|
|
|||
|
|
@ -151,5 +151,32 @@ PROPERTY_MAPPING = {
|
|||
'Flat: Enclosed End Terrace: Mid Floor': 'flat',
|
||||
'Flat: Enclosed End Terrace: Ground Floor': 'flat',
|
||||
'Flat: Enclosed Mid Terrace: Top Floor': 'flat',
|
||||
'2013 onwards': 'unknown'
|
||||
'2013 onwards': 'unknown',
|
||||
|
||||
'House 2 Storey': 'house',
|
||||
'Bung': 'bungalow',
|
||||
'House 3 Storey': 'house',
|
||||
'Shared Flat': 'flat',
|
||||
'd': 'unknown',
|
||||
'Mais': 'maisonette',
|
||||
'e': 'unknown',
|
||||
'Shared House': 'house',
|
||||
'House 4 Storey': 'house',
|
||||
'Shared Bungalow': 'bungalow',
|
||||
'Detch': 'house',
|
||||
'Shop': 'other',
|
||||
'Terr': 'house',
|
||||
'Terrace': 'house',
|
||||
'Description': 'unknown',
|
||||
'Hse': 'house',
|
||||
'Room': 'other',
|
||||
'Office': 'other',
|
||||
'Room In Shared Accommodation': 'other',
|
||||
'Apartment': 'flat',
|
||||
'm': 'unknown',
|
||||
'Garage': 'other',
|
||||
'Parking Space': 'other',
|
||||
'Community Centre': 'other',
|
||||
'Communal Facility': 'other',
|
||||
'Semi': 'house'
|
||||
}
|
||||
|
|
|
|||
|
|
@ -22,5 +22,6 @@ ROOF_CONSTRUCTION_MAPPINGS = {
|
|||
'ND (inferred)': 'unknown',
|
||||
'2018 onwards': 'unknown',
|
||||
'Pitched (vaulted ceiling)': 'pitched insulated',
|
||||
np.nan: "unknown"
|
||||
np.nan: "unknown",
|
||||
None: "unknown"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -157,5 +157,14 @@ WALL_CONSTRUCTION_MAPPINGS = {
|
|||
'Timber frame': 'timber frame unknown insulation',
|
||||
'2017 onwards': 'new build - average thermal transmittance',
|
||||
'ND (inferred)': 'unknown',
|
||||
'Flat / maisonette': 'other'
|
||||
'Flat / maisonette': 'other',
|
||||
|
||||
'Other': 'other',
|
||||
'Timber Frame': 'timber frame unknown insulation',
|
||||
'Cavity Wall': 'cavity unknown insulation',
|
||||
'Non-Traditional': 'system built',
|
||||
'PRC': 'system built',
|
||||
'Cross Wall': 'system built',
|
||||
'Solid Wall': 'solid brick unknown insulation',
|
||||
'Traditional': 'other'
|
||||
}
|
||||
|
|
|
|||
|
|
@ -107,7 +107,10 @@ class Property:
|
|||
# cost and instead, provide a message that the measure has already been installed
|
||||
|
||||
self.already_installed = ast.literal_eval(already_installed['already_installed']) if already_installed else []
|
||||
self.non_invasive_recommendations = non_invasive_recommendations
|
||||
self.non_invasive_recommendations = (
|
||||
non_invasive_recommendations['recommendations'] if
|
||||
non_invasive_recommendations else []
|
||||
)
|
||||
# This is a list of measures that have been recommended for the property
|
||||
if isinstance(measures, list):
|
||||
self.measures = measures
|
||||
|
|
|
|||
|
|
@ -83,7 +83,8 @@ class PlanTriggerRequest(BaseModel):
|
|||
exclusions: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1)
|
||||
inclusions: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1)
|
||||
# This is a list of measures that we want to be included, if they are options
|
||||
required_measures: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1)
|
||||
# Default to empty
|
||||
required_measures: Optional[List[InclusionOrExclusionItem]] = Field(default=[], min_length=1)
|
||||
|
||||
scenario_name: Optional[str] = ""
|
||||
multi_plan: Optional[bool] = False
|
||||
|
|
|
|||
192
etl/customers/bromford/data_cleanup.py
Normal file
192
etl/customers/bromford/data_cleanup.py
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
"""
|
||||
12th April 2025
|
||||
This script attempts to clean up the various pieces of data we have for Bromford, with the intention of producing a
|
||||
standardised asset list
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Step 1
|
||||
# The inspections data is spread across three different files. We attempt to produce one finalised asset list, with
|
||||
# comprehensive inspections
|
||||
|
||||
# Primary asset list
|
||||
asset_list = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford Asset "
|
||||
"List.xlsx",
|
||||
sheet_name="Asset List"
|
||||
)
|
||||
|
||||
#
|
||||
inspections_1 = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
|
||||
"MDS.xlsx",
|
||||
sheet_name="Data list"
|
||||
)
|
||||
inspections_1["Heating Type"] = (inspections_1["Heating Type"] + " " + inspections_1["Heating fuel"]).str.strip()
|
||||
|
||||
inspections_2 = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
|
||||
"MERLIN LANE.xlsx",
|
||||
sheet_name="Report"
|
||||
)
|
||||
inspections_2["AssetTypeDesc"] = inspections_2["PropertyType"].str.split(" ").str[-1]
|
||||
inspections_2["PropTypeDesc"] = inspections_2["PropertyType"].str.split(" ").str[:-1].str.join(" ")
|
||||
|
||||
inspections_3 = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
|
||||
"SEVERN VALE - KLARKE.xlsx",
|
||||
sheet_name="Asset report"
|
||||
)
|
||||
|
||||
inspections_3["FullAddress"] = inspections_3["T1_Address1"] + ", " + inspections_3["T1_Address2"]
|
||||
|
||||
# On inspections 3, we have multiple sheets which describe the heating
|
||||
heating_systems = []
|
||||
for sheet_name in [
|
||||
"Storage Heaters", "No Heating", "Underfloor Heating", "Rointe Electric Heating", "Air Source Heating",
|
||||
"Gas Central Heating", "Electric Boiler", "Oil Fired Central Heating",
|
||||
"Communal Boilers", "Panel Heaters"
|
||||
]:
|
||||
df = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme "
|
||||
"Rebuild/Inspections/BROMFORD "
|
||||
"SEVERN VALE - KLARKE.xlsx",
|
||||
sheet_name=sheet_name
|
||||
)
|
||||
df = df[["UPRN"]]
|
||||
df["Heating Type"] = sheet_name
|
||||
heating_systems.append(df)
|
||||
|
||||
heating_systems = pd.concat(heating_systems)
|
||||
# We have no clue which one is correct, we have some dupes
|
||||
heating_systems = heating_systems.drop_duplicates("UPRN")
|
||||
heating_systems = heating_systems.rename(columns={"UPRN": "Asset"})
|
||||
heating_systems["Asset"] = heating_systems["Asset"].astype(int)
|
||||
|
||||
inspections_3 = inspections_3.merge(heating_systems, how="left", on="Asset")
|
||||
|
||||
# Create a consolidated inspections sheet
|
||||
inspections = pd.concat(
|
||||
[
|
||||
inspections_1[["Asset", "Construction type", 'Heating Type', "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
|
||||
inspections_2[["Asset", "Construction type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
|
||||
inspections_3[["Asset", 'Heating Type', "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
|
||||
]
|
||||
)
|
||||
|
||||
inspections_address_data = pd.concat(
|
||||
[
|
||||
inspections_1[
|
||||
["Asset", "FullAddress", "PostCode", "ConYear", "Beds", "AssetTypeDesc", "PropTypeDesc", 'ManAreaDesc', ]
|
||||
],
|
||||
inspections_2[
|
||||
['Asset', 'FullAddress', 'AccomType', "AssetTypeDesc", "PropTypeDesc", 'ConYear', 'Postcode']
|
||||
].rename(columns={"Postcode": "PostCode"}),
|
||||
inspections_3[
|
||||
['Asset', "FullAddress", 'T1_Postcode', 'T1_Build Year', 'T1_AssetType']
|
||||
].rename(
|
||||
columns={"T1_Postcode": "PostCode", "T1_Build Year": "ConYear", "T1_AssetType": "AssetTypeDesc"}
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# Remove some error values
|
||||
inspections = inspections[~inspections["Asset"].isin(
|
||||
[
|
||||
"They're all green partial fill they're all green this",
|
||||
"South Staffordshire District Council",
|
||||
'Blk Milton Crt F9-10, Perton, Wolverhampton'
|
||||
]
|
||||
)]
|
||||
|
||||
inspections["Asset"] = inspections["Asset"].astype(str)
|
||||
asset_list["Asset"] = asset_list["Asset"].astype(str)
|
||||
inspections_address_data["Asset"] = inspections_address_data["Asset"].astype(str)
|
||||
inspections['WFT Findings'] = inspections['WFT Findings'].replace(r'^\s*$', pd.NA, regex=True)
|
||||
|
||||
# We have some cases where the inspections data has dupes on Asset (the ID column). We take the instance that is
|
||||
# populated
|
||||
inspections = inspections.sort_values(by='WFT Findings', na_position='last')
|
||||
inspections = inspections.drop_duplicates(subset='Asset', keep='first')
|
||||
|
||||
# We have dupes in the asset list
|
||||
asset_list = asset_list.drop_duplicates("Asset")
|
||||
|
||||
# Merge on
|
||||
missed_asset_ids = inspections[
|
||||
~inspections["Asset"].isin(asset_list["Asset"].values)
|
||||
]["Asset"].values
|
||||
|
||||
missed_assets = inspections_address_data[
|
||||
inspections_address_data["Asset"].isin(missed_asset_ids)
|
||||
]
|
||||
missed_assets = missed_assets.drop_duplicates("Asset")
|
||||
|
||||
# We produce a larger asset list
|
||||
asset_list = pd.concat([asset_list, missed_assets])
|
||||
|
||||
asset_list = asset_list.merge(
|
||||
inspections, how="left", on="Asset"
|
||||
)
|
||||
asset_list["WFT Findings"] = asset_list["WFT Findings"].fillna("No Inspections Note")
|
||||
|
||||
# Store
|
||||
# asset_list.to_excel(
|
||||
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared "
|
||||
# "data/asset_list.xlsx"
|
||||
# )
|
||||
|
||||
# We now prepare outcomes into a single file
|
||||
pv_outcomes = pd.read_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford PV "
|
||||
"Outcomes.csv",
|
||||
encoding='cp1252'
|
||||
)
|
||||
pv_outcomes["measure_type"] = "solar"
|
||||
|
||||
other_outcomes = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/(Bromford) "
|
||||
"15.04.2024.xlsx",
|
||||
sheet_name="ECO4 & GBIS",
|
||||
header=1
|
||||
)
|
||||
other_outcomes["measure_type"] = "cwi"
|
||||
|
||||
combined_outcomes = pd.concat(
|
||||
[
|
||||
other_outcomes[["NO", "ADDRESS", "POSTCODE", "WEEK COMMENCING", "OUTCOMES", "NOTES"]].rename(
|
||||
columns={
|
||||
"NO": "No", "ADDRESS": "Address", "POSTCODE": "Postcode", "WEEK COMMENCING": "Week Commencing",
|
||||
"OUTCOMES": "Outcome", "NOTES": "Notes"
|
||||
}
|
||||
),
|
||||
pv_outcomes[['No', 'Address', 'Postcode', "Week Commencing", "Outcome", "Notes"]]
|
||||
]
|
||||
)
|
||||
|
||||
# Store
|
||||
# combined_outcomes.to_excel(
|
||||
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared "
|
||||
# "data/outcomes.xlsx"
|
||||
# )
|
||||
|
||||
# Submissions sheet -
|
||||
eco3_submissions = pd.read_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 3 Submissions.csv",
|
||||
encoding='cp1252'
|
||||
)
|
||||
# Get rid of the unnamed columns
|
||||
unnamed_columns = [c for c in eco3_submissions.columns if "Unnamed: " in c]
|
||||
eco3_submissions = eco3_submissions.drop(columns=unnamed_columns)
|
||||
# Store
|
||||
eco3_submissions.to_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 3 submissions.csv",
|
||||
index=False
|
||||
)
|
||||
|
||||
eco4_submissions = pd.read_csv(
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 4 submissions.csv",
|
||||
)
|
||||
|
||||
same_cols = [c for c in eco4_submissions.columns if c in eco3_submissions.columns]
|
||||
|
|
@ -4,7 +4,7 @@ from dotenv import load_dotenv
|
|||
from utils.s3 import save_csv_to_s3
|
||||
from etl.find_my_epc.AssetListEpcData import AssetListEpcData
|
||||
|
||||
PORTFOLIO_ID = 140
|
||||
PORTFOLIO_ID = 141
|
||||
USER_ID = 8
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
|
|
@ -19,17 +19,20 @@ def app():
|
|||
|
||||
asset_list = [
|
||||
{
|
||||
"address": "Brow Cottage",
|
||||
"postcode": "YO18 7PZ",
|
||||
"uprn": 10007630752,
|
||||
"property_type": "House",
|
||||
"built_form": "Semi-Detached",
|
||||
"address": "196 Merrow Street",
|
||||
"postcode": "SE17 2NP",
|
||||
"uprn": 200003423454,
|
||||
"patch": True
|
||||
},
|
||||
{
|
||||
"address": "Wyburn",
|
||||
"postcode": "DT1 2LL",
|
||||
"uprn": 100040630290
|
||||
"address": "65 Liverpool Grove",
|
||||
"postcode": "SE17 2HP",
|
||||
"uprn": 200003423194
|
||||
},
|
||||
{
|
||||
"address": "2 Brettell Street",
|
||||
"postcode": "SE17 2NZ",
|
||||
"uprn": 200003423607
|
||||
},
|
||||
]
|
||||
asset_list = pd.DataFrame(asset_list)
|
||||
|
|
@ -71,12 +74,16 @@ def app():
|
|||
|
||||
valuation_data = [
|
||||
{
|
||||
"valuation": 469_000,
|
||||
"uprn": 10007630752,
|
||||
"valuation": 339_000,
|
||||
"uprn": 200003423454,
|
||||
},
|
||||
{
|
||||
"valuation": 373_000,
|
||||
"uprn": 100040630290
|
||||
"valuation": 374_000,
|
||||
"uprn": 200003423194
|
||||
},
|
||||
{
|
||||
"valuation": 719_000,
|
||||
"uprn": 200003423607
|
||||
},
|
||||
]
|
||||
# Store valuation data to s3
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
import re
|
||||
import openpyxl
|
||||
import Levenshtein
|
||||
from fuzzywuzzy import fuzz
|
||||
from pathlib import Path
|
||||
import msgpack
|
||||
from datetime import datetime
|
||||
|
|
@ -2771,7 +2771,8 @@ class DataLoader:
|
|||
match_to = [x.replace(" ", "") for x in match_to]
|
||||
|
||||
# Perform matching between full key and match_to
|
||||
distances = [Levenshtein.distance(matching_string, s) for s in match_to]
|
||||
distances = [100 - fuzz.ratio(matching_string, s) for s in match_to]
|
||||
|
||||
best_match_index = distances.index(min(distances))
|
||||
# We might want to consider a threshold for the distance; however,
|
||||
# we don't apply one for the moment
|
||||
|
|
|
|||
|
|
@ -635,7 +635,7 @@ class Recommendations:
|
|||
# By limiting here, we don't change the value in current_phase_values. This means that the
|
||||
# future recommendations won't have an impact that is too large
|
||||
li_sap_limit = RoofRecommendations.get_loft_insulation_sap_limit(
|
||||
property_instance.data["roof-energy-eff"], property_instance.data["extension-count"]
|
||||
property_instance.data["roof-energy-eff"], property_instance.roof["insulation_thickness"]
|
||||
)
|
||||
if li_sap_limit is not None:
|
||||
property_phase_impact["sap"] = min(property_phase_impact["sap"], li_sap_limit)
|
||||
|
|
|
|||
|
|
@ -64,16 +64,16 @@ class RoofRecommendations:
|
|||
)
|
||||
|
||||
@classmethod
|
||||
def get_loft_insulation_sap_limit(cls, roof_energy_eff, extension_count):
|
||||
def get_loft_insulation_sap_limit(cls, roof_energy_eff, existing_thickness):
|
||||
"""
|
||||
Get the SAP limit for loft insulation
|
||||
:param roof_energy_eff:
|
||||
:return:
|
||||
"""
|
||||
|
||||
if extension_count == 0:
|
||||
# No limit
|
||||
return None
|
||||
if str(existing_thickness).isdigit():
|
||||
if float(existing_thickness) >= 250:
|
||||
return 0
|
||||
|
||||
if roof_energy_eff in ["Good", "Very Good"]:
|
||||
return 1
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue