updating data pull code

This commit is contained in:
Khalim Conn-Kowlessar 2025-02-12 10:14:14 +00:00
parent 77844c625e
commit 61544d01db
5 changed files with 274 additions and 74 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>

View file

@ -19,9 +19,9 @@ def app():
asset_list = [
{
"address": "49 Brailsford Road",
"postcode": "M14 6PT",
"uprn": 77145666,
"address": "19 Hillcrest Court",
"postcode": "IP21 4YJ",
"uprn": 2630134524,
}
]
asset_list = pd.DataFrame(asset_list)
@ -52,8 +52,8 @@ def app():
valuation_data = [
{
"uprn": 77145666,
"valuation": 337_000
"uprn": 2630134524,
"valuation": 96_000
}
]
# Store valuation data to s3

View file

@ -368,9 +368,10 @@ def app():
additional_properties2 = additional_properties[[
"Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing",
"Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3',
'Same Postcode as Installed under ECO3'
'Same Postcode as Installed under ECO3', "Organisation Reference",
]].rename(
columns={
"Organisation Reference": "Org. ref.",
"SAP": "Parity - Predicted SAP",
"SAP Band": "Parity - Predicted SAP Band",
"Age": "Parity - Build Age",
@ -387,7 +388,12 @@ def app():
)
# Combine the data:
full_dataset = pd.concat([stonewater_cavity_properties, additional_properties2])
stonewater_cavity_properties2 = stonewater_cavity_properties.merge(
features[["Address", "Organisation Reference"]], how="left", on="Organisation Reference"
)
full_dataset = pd.concat([stonewater_cavity_properties2, additional_properties2])
full_dataset = full_dataset.drop(columns=['Osm. ID'])
# We now define the priority list for non-intrusives
full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2]
@ -414,7 +420,7 @@ def app():
df.to_csv(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - "
"revised list.xlsx",
"revised list.csv",
index=False
)

View file

@ -1,7 +1,6 @@
import os
import time
import pickle
from BaseUtility import Definitions
import pandas as pd
import numpy as np
from tqdm import tqdm
@ -17,6 +16,10 @@ from recommendations.recommendation_utils import (
estimate_number_of_floors
)
from etl.epc_clean.epc_attributes.attribute_utils import (
extract_thermal_transmittance
)
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -158,6 +161,53 @@ def extract_address1(asset_list, full_address_col, method="first_two_words"):
raise ValueError(f"Method {method} not recognized")
def process_age_band(x, year_built_column):
    """
    Compare the year a property was built against its EPC "Property Age Band".

    Args:
        x: A row of the asset list (anything indexable by column name) holding
            "Property Age Band" and the column named by ``year_built_column``.
        year_built_column: Name of the column containing the build year.

    Returns:
        One of "No EPC Age Band", "EPC Age Band Matches Year Built",
        "EPC Age Band is older than Year Built" or
        "EPC Age Band is newer than Year Built".

    Raises:
        ValueError: If the age band is not a bare year, "before {year}",
            "{year} onwards" or "{lower date}-{upper date}".
    """
    no_band = "No EPC Age Band"
    band_matches = "EPC Age Band Matches Year Built"
    band_older = "EPC Age Band is older than Year Built"
    band_newer = "EPC Age Band is newer than Year Built"

    # A missing or non-numeric build year is treated as "no data" rather than
    # crashing the whole DataFrame.apply (float(None) raises TypeError)
    try:
        year_built = float(x[year_built_column])
    except (TypeError, ValueError):
        year_built = float("nan")

    raw_band = x["Property Age Band"]
    if pd.isnull(raw_band) or pd.isnull(year_built):
        return no_band
    if raw_band in Definitions.DATA_ANOMALY_MATCHES:
        return no_band

    # Age band will be formatted as one of:
    #   '{year}'
    #   'England and Wales: before {year}'
    #   'England and Wales: {year} onwards'
    #   'England and Wales: {lower date}-{upper date}'
    # so we drop the region prefix and work out the bounds of the band
    period = str(raw_band).split(": ", 1)[-1].strip()
    try:
        if period.isdigit():
            lower_date = upper_date = float(period)
        elif period.lower().startswith("before"):
            # "before 1900" excludes 1900 itself, so the bound is exclusive.
            # A build year on or after the bound means the band is too old.
            if year_built < float(period.split()[-1]):
                return band_matches
            return band_older
        elif period.lower().endswith("onwards"):
            lower_date, upper_date = float(period.split()[0]), float("inf")
        else:
            lower_text, upper_text = period.split("-")
            lower_date, upper_date = float(lower_text), float(upper_text)
    except ValueError as err:
        raise ValueError(f"Unrecognised Property Age Band: {raw_band!r}") from err

    # The band is "newer" when it starts after the build year and "older"
    # when it ends before the build year
    if year_built < lower_date:
        return band_newer
    if year_built > upper_date:
        return band_older
    return band_matches
def app():
"""
This app is EPC pulling data for some properties owned by Livewest
@ -179,17 +229,47 @@ def app():
Heat loss calculations
EPC recommendations
Property UPRN
"""
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People"
DATA_FILENAME = "Regulated Stock - Do Not Change (06.06.24).xlsx"
SHEET_NAME = "Assets 1"
# TODO:
# For cavity work:
# - Flag any entries that have a different wall type between non-intrusive data against EPC
# - Worth double checking entries that have a difference in wall construction
# - Look at anything that is flagged as an empty cavity but the EPC data says its a filled cavity
# - Look at the current EPC scores - Anything that is C75 or above, especially if its assumed no insulation
# - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats
# are less than C75
# - Flag anything pre SAP2012
# - Flag anything over 5 years old
# - Look at year built vs age band
#
# For Solar:
# - Discount any that have solar PV - based on non-intrusives and from the inspections team
# - In the heating, discount anything that isnt ashp, ghsp, hhrs, electric storage - possibly homes with
# electric room heaters but it might need to be an EPC E
# - Fabric - check the floor, wall and roof:
# - Filled or empty cavity is good
# - Insulated solid/timber/system built is good
# - SCIS/CEG needs solid floors
# - JJC dont care
# - Anything with a loft 200 or below
# - Anything C75 and above wont qualify
# - Insulated loft = 200mm
# - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
# - Or the insulation required is loft/cavity (floors should be solid)
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Eastlight"
DATA_FILENAME = "Eastlight addresses potential PV data pull required.xlsx"
SHEET_NAME = "Sheet1"
POSTCODE_COLUMN = "Postcode"
FULLADDRESS_COLUMN = "Address"
ADDRESS1_COLUMN = "AddressLine1"
FULLADDRESS_COLUMN = None
ADDRESS1_COLUMN = "HouseName"
ADDRESS1_METHOD = None
ADDRESS_COLS_TO_CONCAT = []
ADDRESS_COLS_TO_CONCAT = [
"HouseName", "Block", "Address1"
]
MISSING_POSTCODES_METHOD = None
PROPERTY_YEAR_BUILT = 'Built In Year'
# Maps addresses to uprn in problematic cases
MANUAL_UPRN_MAP = {}
@ -216,6 +296,7 @@ def app():
asset_list[col] = asset_list[col].astype(str)
asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False)
asset_list[col] = asset_list[col].str.strip()
if ADDRESS1_COLUMN is None:
ADDRESS1_COLUMN = "address1_extracted"
@ -226,7 +307,15 @@ def app():
if FULLADDRESS_COLUMN is None:
FULLADDRESS_COLUMN = "fulladdress_extracted"
# We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
# Sometimes, some of the columns are empty, so we need to remove them
asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(
lambda x: ", ".join([y for y in x if not pd.isnull(y)]), axis=1
)
# We clean up potential non-breaking spaces and double spaces
asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].astype(str)
asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False)
asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False)
# We check for duplicated addresses
asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
@ -237,8 +326,10 @@ def app():
asset_list = asset_list.drop(columns=["deduper"])
# We chunk up this data into 5000 rows at a time
# Create the chunks directory
if not os.path.exists(os.path.join(DATA_FOLDER, "Chunks")):
os.makedirs(os.path.join(DATA_FOLDER, "Chunks"))
chunk_size = 5000
epc_data = []
errors = []
no_epc = []
skip = None # Used to skip already completed chunks
@ -275,9 +366,19 @@ def app():
# Store the chunk locally as a csv
pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False)
epc_data.extend(epc_data_chunk)
# We read in and concatenate the created chunks
chunks_folder = os.path.join(DATA_FOLDER, "Chunks")
# List the contents
chunk_files = os.listdir(chunks_folder)
epc_data = []
for file in chunk_files:
csv_data = pd.read_csv(os.path.join(chunks_folder, file))
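# We need to convert the recommendations back to a list
# NOTE(review): eval executes whatever expression is stored in the CSV cell; if these cells only ever hold
# plain Python literals, ast.literal_eval would be a safer way to parse them - confirm before changing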
csv_data["recommendations"] = csv_data["recommendations"].apply(eval)
csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval)
epc_data.append(csv_data)
epc_df = pd.DataFrame(epc_data)
epc_df = pd.concat(epc_data)
# We expand out the recommendations
recommendations_df = epc_df[["row_id", "recommendations"]]
@ -302,9 +403,9 @@ def app():
transformed_data.append(row_data)
transformed_df = pd.DataFrame(transformed_data)
# Drop the column that is ""
if "" in transformed_df.columns:
transformed_df = transformed_df.drop(columns=[""])
# At the moment, we're only using a limited set of columns - let's just keep cavity wall insulation
# recommendations
transformed_df = transformed_df[["row_id", "Cavity wall insulation"]]
# Get the find my epc data
find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
@ -342,7 +443,9 @@ def app():
"energy-consumption-current", # kwh/m2
"photo-supply",
]
].rename(columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"})
].rename(
columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}
)
asset_list = asset_list.merge(
epc_df,
@ -422,6 +525,138 @@ def app():
axis=1
)
# We produce some additional fields
# 1) Is the SAP rating below C75
asset_list["SAP Rating is 75 and below"] = asset_list["SAP score on register"] <= 75
# 2) Flag anything where the EPC is older than 5 years
cutoff_year = pd.Timestamp.now().year - 5
asset_list[f"EPC is pre {cutoff_year}"] = (
pd.to_datetime(asset_list["Date of last EPC"]).dt.year < cutoff_year
)
# 3) If we have year in the asset list, we flag entries where the built year is different from the
# EPC Age band
if PROPERTY_YEAR_BUILT is not None:
asset_list["Does Age Match EPC Age Band?"] = asset_list.apply(
lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1
)
# 4) Flag properties that look like they're good candidates for solar installs
# Firstly, flag if the fabric is completely done
insulated_wall_substrings = [
", insulated", "with external insulation", "with internal insulation", "filled cavity"
]
insulated_roof_substrings = [
"(another dwelling above)", "limited insulation", "(other premises above)",
", no insulation",
]
def check_solar_insulation_conditions(x):
    """
    Classify the fabric (walls, roof, floor) of one property for solar suitability.

    Args:
        x: A row of the asset list with "Wall Construction", "Roof Construction",
            "Floor Construction" and "Roof Insulation Thickness".

    Returns:
        None when there is no wall description, otherwise one of:
        "Walls, Roof and Floor have U-values below 0.7", "Confirm U-values",
        "Walls Insulated, Roof Insulated, Floor Solid",
        "Walls Insulated, Floor Solid, Loft need top-up" or
        "Not Fully Insulated or no data".
    """
    if pd.isnull(x["Wall Construction"]):
        return None

    def lowered(value):
        # Missing descriptions become an empty string so substring checks simply
        # fail instead of raising AttributeError on NaN
        return "" if pd.isnull(value) else str(value).lower()

    wall_text = lowered(x["Wall Construction"])

    if "average thermal transmittance" in wall_text:
        # We extract out the u-values
        wall_uvalue = extract_thermal_transmittance({}, x["Wall Construction"])[0]["thermal_transmittance"]
        roof_uvalue = extract_thermal_transmittance({}, x["Roof Construction"])[0]["thermal_transmittance"]
        floor_uvalue = extract_thermal_transmittance({}, x["Floor Construction"])[0]["thermal_transmittance"]

        # Without a wall u-value we cannot apply the cutoff (None < 0.7 raises
        # TypeError), so ask for the values to be confirmed
        if wall_uvalue is None:
            return "Confirm U-values"

        # A missing roof/floor u-value is treated as passing the cutoff
        roof_uvalue = 0 if roof_uvalue is None else roof_uvalue
        floor_uvalue = 0 if floor_uvalue is None else floor_uvalue

        # We apply some cutoffs
        if wall_uvalue < 0.7 and roof_uvalue < 0.7 and floor_uvalue < 0.7:
            return "Walls, Roof and Floor have U-values below 0.7"
        return "Confirm U-values"

    walls_insulated = any(
        insulated_substring in wall_text for insulated_substring in insulated_wall_substrings
    )

    # A numeric loft thickness takes priority over the free-text roof description
    roof_is_numeric = str(x["Roof Insulation Thickness"]).isdigit()
    if roof_is_numeric:
        roof_insulated = int(x["Roof Insulation Thickness"]) >= 200
    else:
        roof_text = lowered(x["Roof Construction"])
        roof_insulated = any(
            insulated_substring in roof_text for insulated_substring in insulated_roof_substrings
        )

    floor_is_solid = "solid" in lowered(x["Floor Construction"])

    if walls_insulated and roof_insulated and floor_is_solid:
        return "Walls Insulated, Roof Insulated, Floor Solid"

    # Loft thickness is known but below 200mm: only a top-up is needed
    if walls_insulated and floor_is_solid and roof_is_numeric:
        return "Walls Insulated, Floor Solid, Loft need top-up"

    return "Not Fully Insulated or no data"
asset_list["Solar Fabric Condition"] = asset_list.apply(check_solar_insulation_conditions, axis=1)
asset_list["Good Solar Candidate"] = (
asset_list["SAP Rating is 75 and below"] &
~asset_list["Has Solar PV"] &
(
asset_list["Heating Type"].isin(
[
"Electric storage heaters",
"Room heaters, electric",
]
) | asset_list["Heating Type"].str.contains("heat pump", case=False)
) & (
asset_list["Solar Fabric Condition"].isin(
[
"Walls Insulated, Roof Insulated, Floor Solid",
"Walls, Roof and Floor have U-values below 0.7",
"Walls Insulated, Floor Solid, Loft need top-up"
]
)
)
)
def flat_analysis(asset_list):
    """
    Summarise, per postcode, how many flats have a SAP score below 75.

    Args:
        asset_list: DataFrame with "Postcode", "Property Type" and
            "SAP score on register" columns.

    Returns:
        DataFrame with one row per (postcode, property type) group whose
        property type is "flat" (case-insensitive), giving the number of flats,
        the number scoring below 75 and that proportion as a rounded percentage.
    """

    # We need to deduce the building name - we strip out the house number
    # TODO: This doesn't really work (and nothing calls it yet)
    def extract_building_name(x):
        if pd.isnull(x):
            return None
        house_no = SearchEpc.get_house_number(address=x, postcode=None)
        if not house_no:
            return x.split(",")[0].strip()
        return x.replace(house_no, "").strip()

    # We want to deduce if flats have 50% of the properties below C75,
    # so we walk each postcode / property type combination
    summary_rows = []
    for _, members in asset_list.groupby(["Postcode", "Property Type"]):
        lowered_types = members["Property Type"].str.lower()
        if "flat" not in lowered_types.values:
            continue

        flat_count = lowered_types.value_counts().get("flat", 0)
        below_count = (members["SAP score on register"] < 75).sum()
        summary_rows.append(
            {
                "Postcode": members["Postcode"].iloc[0],
                "Property Type": "Flat",
                "Number of Flats with EPC": flat_count,
                "Number of Flats below C75": below_count,
                "Proportion of Flat EPCs below C75": round(100 * below_count / flat_count),
            }
        )

    return pd.DataFrame(summary_rows)
flat_data = flat_analysis(asset_list)
# For all of the columns in transformed_df, prefix with "Recommendation: "
for col in transformed_df.columns:
if col == "row_id":
@ -436,54 +671,13 @@ def app():
asset_list = asset_list.drop(columns=["row_id", "index"])
# Store as an excel
filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull - Main.xlsx"
asset_list.to_excel(filename, index=False)
filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
# Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
with pd.ExcelWriter(filename) as writer:
asset_list.to_excel(writer, sheet_name="EPC Data", index=False)
flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
matches_review = asset_list[
[FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"]
]
import requests
import base64
API_KEY = "c4afe10370d67eeaa44f067dd37d115263f6c90e"
URL = "https://epc.opendatacommunities.org/api/v1/domestic/search?size=20"
email = "itskruel@gmail.com"
AUTH_TOKEN = base64.b64encode(
":".join([email, API_KEY]).encode("utf-8")
)
AUTH_TOKEN = "aXRza3J1ZWxAZ21haWwuY29tOmM0YWZlMTAzNzBkNjdlZWFhNDRmMDY3ZGQzN2QxMTUyNjNmNmM5MGU="
headers = {
"Authorization": "Basic {auth_token}".format(auth_token=AUTH_TOKEN),
"Accept": "application/json",
}
params = {
"UPRN": "766024370"
}
response = requests.get(url="https://epc.opendatacommunities.org/api/v1/domestic/search?size=20&UPRN=766024370",
headers=headers)
response.json()
data = response.json()
from operator import itemgetter
newest = sorted(data["rows"], key=itemgetter('lodgement-date'))
data["rows"][0]["lodgement-date"]
data["rows"][1]["lodgement-date"]
import pandas as pd
df = pd.DataFrame(data["rows"])
df["uprn"].values[2]
df[df["uprn"] == "3455035000"]["property-type"]
from backend.apis.GoogleSolarApi import GoogleSolarApi