diff --git a/backend/Property.py b/backend/Property.py index ab8930c5..79108dc1 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -1204,7 +1204,7 @@ class Property: return False suitable_house = self.data["property-type"] == "House" and self.data["built-form"] in [ - "Detached", "Semi-Detached", + "Detached", "Semi-Detached", "End-Terrace", ] suitable_bungalow = self.data["property-type"] == "Bungalow" and self.data["built-form"] in [ diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index f4924c71..3b91a461 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -543,7 +543,11 @@ async def trigger_plan(body: PlanTriggerRequest): representative_recommendations = {} for p in tqdm(input_properties): recommender = Recommendations( - property_instance=p, materials=materials, exclusions=body.exclusions, inclusions=body.inclusions + property_instance=p, + materials=materials, + exclusions=body.exclusions, + inclusions=body.inclusions, + default_u_values=body.default_u_values ) property_recommendations, property_representative_recommendations = recommender.recommend() diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py index 0d58c7e9..4b43db80 100644 --- a/backend/app/plan/schemas.py +++ b/backend/app/plan/schemas.py @@ -89,6 +89,9 @@ class PlanTriggerRequest(BaseModel): # if False, allows optimisation to be switched off optimise: Optional[bool] = True + # If True, uses default u-values for models + default_u_values: Optional[bool] = True + _allowed_goals = {"Increasing EPC"} _allowed_housing_types = {"Social", "Private"} diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index cbcebb9f..68432577 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -103,6 +103,8 @@ class PropertyValuation: # Vander Elliot Intrusive surveys 12103116: 1_537_000, 12103117: 1_404_000, + # GLA Proposal + 100020606627: 409_000 } # We base our valuation uplifts on a 
number of sources diff --git a/etl/customers/gla/__init__.py b/etl/customers/gla/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/etl/customers/gla/example_model_outputs.py b/etl/customers/gla/example_model_outputs.py new file mode 100644 index 00000000..e239c43d --- /dev/null +++ b/etl/customers/gla/example_model_outputs.py @@ -0,0 +1,38 @@ +import pandas as pd +from utils.s3 import save_csv_to_s3 + +asset_list = [ + { + "address": "4, King Henrys Drive", + "postcode": "CR0 0PA" + }, +] +portfolio_id = 110 +user_id = 8 + +asset_list = pd.DataFrame(asset_list) + +filename = f"{user_id}/{portfolio_id}/asset_list.csv" +save_csv_to_s3( + dataframe=asset_list, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename +) + +body1 = { + "portfolio_id": str(portfolio_id), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "A", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": "", + "inclusions": [ + "cavity_wall_insulation", "loft_insulation", "air_source_heat_pump", "solar_pv" + ], + "budget": None, + "scenario_name": "Whole House", + "multi_plan": False, +} +print(body1) diff --git a/etl/customers/gla/proposal_investigation.py b/etl/customers/gla/proposal_investigation.py new file mode 100644 index 00000000..f6a87af1 --- /dev/null +++ b/etl/customers/gla/proposal_investigation.py @@ -0,0 +1,173 @@ +""" +This script performs some basic analysis to identify EPC data for postcodes specified in the Warmer Homes Local Grant +""" + +import inspect +import requests +import json +import pandas as pd +from pathlib import Path +from etl.ownership.Ownership import Ownership + +postcodes = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes_RP edit.xlsx", sheet_name='Eligible postcodes' +) +# Take just the first three columns +postcodes = postcodes[ + ['List of eligible postcodes via the IMD Income Decile 1-2 
pathway', 'Unnamed: 1', 'Unnamed: 2'] +] + +postcodes.columns = ['postcode', 'Local Authority', 'London Borough?'] +# Drop the first two rows (index labels 0 and 1) +postcodes = postcodes.drop([0, 1]) +# Take just the London Boroughs +postcodes = postcodes[postcodes["London Borough?"] == "Yes"] +# Since there are a large number of postcodes (425k), let's just take a few examples +# Take postcodes that begin with "BN15" +# postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")] + +# The Local Authority is Adur, so let's get the EPC data for this area +# epc_data = pd.read_csv( +# "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223-Adur" +# "/certificates.csv", low_memory=False +# ) +# # Filter on these postcodes +# epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(postcodes["postcode"].str.lower())] +# epc_data = epc_data[~pd.isnull(epc_data["UPRN"])] +# # Take the newest EPC for each UPRN, based on LODGEMENT_DATE +# epc_data["LODGEMENT_DATE"] = pd.to_datetime(epc_data["LODGEMENT_DATE"]) +# epc_data = epc_data.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN") +# +# # Let's look at the breakdown of EPC ratings. We want the count and the % of the total +# ratings_distribution = epc_data.groupby("CURRENT_ENERGY_RATING").size().reset_index() +# ratings_distribution.columns = ["Rating", "Count"] +# ratings_distribution["Percentage"] = ratings_distribution["Count"] / ratings_distribution["Count"].sum() * 100 + +# Can we identify the owners of these units so we can contact them? 
+ +file_src = inspect.getfile(lambda x: None) +DATA_DIRECTORY = Path(file_src).parent / "local_data" / "all-domestic-certificates" +epc_paths = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] +epc_paths = [str(entry / "certificates.csv") for entry in epc_paths] + +ownership = Ownership( + epc_paths=epc_paths, + domestic_ownership_path="/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_07.csv", + overseas_ownership_path="/Users/khalimconn-kowlessar/Downloads/OCOD_FULL_2024_07.csv", + land_registry_path="/Users/khalimconn-kowlessar/Downloads/pp-complete.csv", + project_name="gla-proposal", + bucket="retrofit-data-dev", + average_property_value=0, + portfolio_value=0, + excluded_owners=[], + excluded_uprns=[], + save=True +) + +# Data will be found at ownership/gla-proposal +ownership.source_epc_properties(column_filters={}, postcodes=postcodes["postcode"].str.lower().tolist()) + +# Step 2: Get company ownership data +ownership.load_company_ownership() + +# Step 3: Prepare data for matching +ownership.prepare_for_matching() + +# Step 4: Match EPC data to ownership data +ownership.match() + +from utils.s3 import save_excel_to_s3, read_excel_from_s3 + +# Save the data to S3 +# save_excel_to_s3( +# df=ownership.matched_addresses, +# bucket_name=ownership.bucket, +# file_key=ownership.matched_addresses_pre_filter_filepath +# ) + +# Read in matches +matches = read_excel_from_s3( + bucket_name=ownership.bucket, + file_key="ownership/gla-proposal/2024-10-10 19:02:34.131365/matched_addresses_pre_filter.xlsx", + header_row=0 +) + +# We have the matches, which we now need to match to the postcodes +matches = ownership.matched_addresses.copy() +# filter matches on the postcodes we're interested in +matches = matches[matches["epc_postcode"].str.lower().isin(postcodes["postcode"].str.lower())] +# Remove any social transactions +matches = matches[~matches["TENURE"].isin( + ["Rented (social)", "rental (social)", + "Not defined - use in the case of a new dwelling 
for which the intended tenure in not known. It is not to be " + "used for an existing dwelling", "NO DATA!"]) +] +matches["is_prs"] = matches["TENURE"].isin(["rental (private)", "Rented (private)"]) +# Look at the EPC ratings +epc_ratings = matches.groupby(["CURRENT_ENERGY_RATING"]).size().reset_index() +epc_ratings.columns = ["EPC Rating", "Count"] +epc_ratings["Percentage"] = epc_ratings["Count"] / epc_ratings["Count"].sum() * 100 + +# Take properties that are below an EPC C rating, as defined by the guidance and remove any new builds +matches = matches[matches["CURRENT_ENERGY_RATING"].isin(["D", "E", "F", "G"])] +# 11,694 properties +matches["epc_postcode"].nunique() +# 6899 + +owners_count = matches.groupby(['Proprietor Name (1)', 'Company Registration No. (1)']).size().reset_index() +owners_count.columns = ['Owner', 'Owner Registration #', 'Count'] +owners_count = owners_count.sort_values('Count', ascending=False) +owners_count["Percentage"] = owners_count["Count"] / owners_count["Count"].sum() * 100 + +# Take an example postal region +matches = matches.sort_values("epc_postcode", ascending=True) +# BR1, BR5 +example = matches[matches["epc_postcode"].str.startswith("CR0 ")].copy() +example = example[example["TENURE"].isin(["rental (private)", "Rented (private)"])] + +pd.set_option('display.max_rows', 500) +pd.set_option('display.max_columns', 500) +pd.set_option('display.width', 1000) +example[ + ["epc_address", "epc_postcode", "CURRENT_ENERGY_RATING", "CURRENT_ENERGY_EFFICIENCY", "Proprietor Name (1)", + "Company Registration No. 
(1)"] +].head(4) + +ownership.epc_data["UPRN"] = ownership.epc_data["UPRN"].astype(int) +example = example.merge( + ownership.epc_data[["UPRN", "BUILT_FORM", "PROPERTY_TYPE", "WALLS_DESCRIPTION", "ROOF_DESCRIPTION"]], + on="UPRN", + how="left" +) +z = example[example["CURRENT_ENERGY_RATING"] == "E"] +z = z[z["TENURE"].isin(["rental (private)", "Rented (private)"])] +import os +# Read the Companies House API key from the environment so that no credential is committed to the repository +companies_house_api_key = os.environ["COMPANIES_HOUSE_API_KEY"] +company_number = example.head(1)["Company Registration No. (1)"].values[0] +url = f'https://api.company-information.service.gov.uk/company/{company_number}' + +# Make the API request +response = requests.get(url, auth=(companies_house_api_key, '')) + +# Check if the request was successful +if response.status_code == 200: + company_data = response.json() + # Pretty-print the fetched data + print(json.dumps(company_data, indent=4)) +else: + print(f"Failed to fetch data. Status code: {response.status_code}") + # Retry with a zero prepended to the company number + company_number = f"0{company_number}" + url = f'https://api.company-information.service.gov.uk/company/{company_number}' + response = requests.get(url, auth=(companies_house_api_key, '')) + company_data = response.json() + +from pprint import pprint + +pprint(company_data) + +psc_url = f'https://api.company-information.service.gov.uk/company/{company_number}/persons-with-significant-control' +psc_response = requests.get(psc_url, auth=(companies_house_api_key, '')) +psc_data = psc_response.json() +pprint(psc_data) diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py index 3bc4b60d..68dee9ed 100644 --- a/etl/ownership/Ownership.py +++ b/etl/ownership/Ownership.py @@ -61,6 +61,7 @@ class Ownership: portfolio_value: float, excluded_owners: List[str] = None, excluded_uprns: List[int] = None, + save=True ): """ @@ -115,6 +116,8 @@ class Ownership: f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_epc_data.xlsx" ) + self.save = save + # Data 
self.epc_data = None self.ownership_data = None @@ -158,21 +161,22 @@ class Ownership: # Step 5: Match land registry data to existing matches self.match_with_land_registry() # We store this data in s3 before we perform any filtering - save_excel_to_s3( - df=self.matched_addresses, - bucket_name=self.bucket, - file_key=self.matched_addresses_pre_filter_filepath - ) - save_excel_to_s3( - df=self.combined_matching_lookup, - bucket_name=self.bucket, - file_key=self.combined_matching_lookup_pre_filter_filepath - ) + if self.save: + save_excel_to_s3( + df=self.matched_addresses, + bucket_name=self.bucket, + file_key=self.matched_addresses_pre_filter_filepath + ) + save_excel_to_s3( + df=self.combined_matching_lookup, + bucket_name=self.bucket, + file_key=self.combined_matching_lookup_pre_filter_filepath + ) # Prepare the final outputs: self.create_final_matches() - def source_epc_properties(self, column_filters=None): + def source_epc_properties(self, column_filters=None, postcodes=None): """ This function will filter the epc data as specified by column filters, searching across all of the EPC tables :param column_filters: Dictionary with column names as keys and list of acceptable values as values. This @@ -180,6 +184,7 @@ class Ownership: {"column_name": ["value1", "value2", ...]}, where column_name is the name of the column in the EPC data and ["value1", "value2", ...] is a list of acceptable values for that column. If a column is not found in the EPC data, an exception is raised. + :param postcodes: A list of postcodes to filter the data on """ column_filters = {} if column_filters is None else column_filters @@ -203,6 +208,11 @@ class Ownership: else: raise Exception(f"Column {column} not found in data. 
column_filters is malformed") + if postcodes is not None: + epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(postcodes)] + if epc_data.empty: + continue + data.append(epc_data) self.epc_data = pd.concat(data, ignore_index=True) @@ -210,12 +220,13 @@ class Ownership: if self.excluded_uprns: self.epc_data = self.epc_data[~self.epc_data["UPRN"].astype(float).isin(self.excluded_uprns)] - # We now store the data in s3 - save_excel_to_s3( - df=self.epc_data, - bucket_name=self.bucket, - file_key=self.epc_data_filepath - ) + if self.save: + # We now store the data in s3 + save_excel_to_s3( + df=self.epc_data, + bucket_name=self.bucket, + file_key=self.epc_data_filepath + ) def load_company_ownership(self): """ @@ -484,11 +495,11 @@ class Ownership: house_no = house_no.replace(",", "") if house_no is None: - # It's hard for us to get a reliable match - # filtered = filtered[filtered["Property Address"].str.contains(address["ADDRESS1"])] - # if filtered.shape[0] > 1: - # raise Exception("No valid - maybe we should do levenstein?") - continue + # If the house number is missing, it means that we usually have a named property so we look for a + # case-insensitive literal (non-regex) substring match on that name and keep it only when exactly one row matches + filtered = filtered[filtered["Property Address"].str.lower().str.contains(address["ADDRESS"].lower(), regex=False)] + if filtered.shape[0] != 1: + continue else: @@ -590,7 +601,8 @@ class Ownership: "CURRENT_ENERGY_RATING", "POSTCODE", "LODGEMENT_DATE", - "TRANSACTION_TYPE" + "TRANSACTION_TYPE", + "TENURE", ] ].rename( columns={ @@ -1002,25 +1014,26 @@ class Ownership: if self.portfolio_properties["UPRN"].nunique() != self.portfolio_epc_data["UPRN"].nunique(): raise ValueError("Portfolio properties and epc data don't match") - logger.info("Storing final outpus") - # Store data - save_excel_to_s3( - df=self.portfolio_owners, - bucket_name=self.bucket, - file_key=self.portfolio_owners_filepath, - ) + if self.save: + logger.info("Storing final outpus") + # Store data + save_excel_to_s3( + df=self.portfolio_owners, 
+ bucket_name=self.bucket, + file_key=self.portfolio_owners_filepath, + ) - save_excel_to_s3( - df=self.portfolio_properties, - bucket_name=self.bucket, - file_key=self.portfolio_properties_filepath, - ) + save_excel_to_s3( + df=self.portfolio_properties, + bucket_name=self.bucket, + file_key=self.portfolio_properties_filepath, + ) - save_excel_to_s3( - df=self.portfolio_epc_data, - bucket_name=self.bucket, - file_key=self.portfolio_epc_data_filepath, - ) + save_excel_to_s3( + df=self.portfolio_epc_data, + bucket_name=self.bucket, + file_key=self.portfolio_epc_data_filepath, + ) def get_asset_list(self): """