From 2d7e9a3cc9bb19bb558d9fbdefc495e4b2826e26 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 10 Oct 2024 18:12:29 +0100 Subject: [PATCH 1/5] setting up code for gla proposal --- etl/customers/gla/__init__.py | 0 etl/customers/gla/proposal_investigation.py | 76 +++++++++++++++++++++ etl/ownership/Ownership.py | 19 ++++-- 3 files changed, 88 insertions(+), 7 deletions(-) create mode 100644 etl/customers/gla/__init__.py create mode 100644 etl/customers/gla/proposal_investigation.py diff --git a/etl/customers/gla/__init__.py b/etl/customers/gla/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/etl/customers/gla/proposal_investigation.py b/etl/customers/gla/proposal_investigation.py new file mode 100644 index 00000000..e36d82b8 --- /dev/null +++ b/etl/customers/gla/proposal_investigation.py @@ -0,0 +1,76 @@ +""" +This script performs some basic analysis to identify EPC data for postcodes specified in the Warmer Homes Local Grant +""" +from nis import match + +import pandas as pd +from etl.ownership.Ownership import Ownership + +postcodes = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx", sheet_name='Eligible postcodes' +) +# Take just the first two columns +postcodes = postcodes[ + ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1'] +] + +postcodes.columns = ['postcode', 'Local Authority'] +# Drop the first row +postcodes = postcodes.drop([0, 1]) +# Since there are a large number of potcodes (425k), let's just take a few examples +# Take postcodes that begin with "BN15" +postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")] + +# The Local Authority is Adur, so let's get the EPC data for this area +# epc_data = pd.read_csv( +# "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223-Adur" +# "/certificates.csv", low_memory=False +# ) +# # Filter on these postcodes +# epc_data = 
epc_data[epc_data["POSTCODE"].str.lower().isin(postcodes["postcode"].str.lower())] +# epc_data = epc_data[~pd.isnull(epc_data["UPRN"])] +# # Take the newest EPC for each UPRN, based on LODGEMENT_DATE +# epc_data["LODGEMENT_DATE"] = pd.to_datetime(epc_data["LODGEMENT_DATE"]) +# epc_data = epc_data.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN") +# +# # Let's look at the breakdown of EPC ratings. We want the count and the % of the total +# ratings_distribution = epc_data.groupby("CURRENT_ENERGY_RATING").size().reset_index() +# ratings_distribution.columns = ["Rating", "Count"] +# ratings_distribution["Percentage"] = ratings_distribution["Count"] / ratings_distribution["Count"].sum() * 100 + +# Can we identify the owners of these units so we can contact them? +ownership = Ownership( + epc_paths=[ + "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223" + "-Adur/certificates.csv" + ], + domestic_ownership_path="/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_07.csv", + overseas_ownership_path="/Users/khalimconn-kowlessar/Downloads/OCOD_FULL_2024_07.csv", + land_registry_path="/Users/khalimconn-kowlessar/Downloads/pp-complete.csv", + project_name="gla-proposal", + bucket="retrofit-data-dev", + average_property_value=0, + portfolio_value=0, + excluded_owners=[], + excluded_uprns=[], + save=False +) + +# Data will be found at ownership/gla-proposal +ownership.source_epc_properties(column_filters={}) + +# Step 2: Get company ownership data +ownership.load_company_ownership() + +# Step 3: Prepare data for matching +ownership.prepare_for_matching() + +# Step 4: Match EPC data to ownership data +ownership.match() + +# We have the matches, which we now need to match to the postcodes +matches = ownership.matched_addresses.copy() +# filter matches on the postcodes we're interested in +matches = matches[matches["epc_postcode"].str.lower().isin(postcodes["postcode"].str.lower())] +# Remove any social 
transactions +matches = matches[~matches["TENURE"].isin(["Rented (social)", "rental (social)"])] diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py index 3bc4b60d..2079391c 100644 --- a/etl/ownership/Ownership.py +++ b/etl/ownership/Ownership.py @@ -61,6 +61,7 @@ class Ownership: portfolio_value: float, excluded_owners: List[str] = None, excluded_uprns: List[int] = None, + save=True ): """ @@ -115,6 +116,8 @@ class Ownership: f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_epc_data.xlsx" ) + self.save = save + # Data self.epc_data = None self.ownership_data = None @@ -210,12 +213,13 @@ class Ownership: if self.excluded_uprns: self.epc_data = self.epc_data[~self.epc_data["UPRN"].astype(float).isin(self.excluded_uprns)] - # We now store the data in s3 - save_excel_to_s3( - df=self.epc_data, - bucket_name=self.bucket, - file_key=self.epc_data_filepath - ) + if self.save: + # We now store the data in s3 + save_excel_to_s3( + df=self.epc_data, + bucket_name=self.bucket, + file_key=self.epc_data_filepath + ) def load_company_ownership(self): """ @@ -590,7 +594,8 @@ class Ownership: "CURRENT_ENERGY_RATING", "POSTCODE", "LODGEMENT_DATE", - "TRANSACTION_TYPE" + "TRANSACTION_TYPE", + "TENURE", ] ].rename( columns={ From a953a1f0ee215f30cb5a17953f0d8f4b167caa18 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 10 Oct 2024 18:38:08 +0100 Subject: [PATCH 2/5] Improving ownership matching algorithm --- etl/customers/gla/proposal_investigation.py | 29 +++++++++++++++++++++ etl/ownership/Ownership.py | 10 +++---- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/etl/customers/gla/proposal_investigation.py b/etl/customers/gla/proposal_investigation.py index e36d82b8..57df0554 100644 --- a/etl/customers/gla/proposal_investigation.py +++ b/etl/customers/gla/proposal_investigation.py @@ -74,3 +74,32 @@ matches = ownership.matched_addresses.copy() matches = 
matches[matches["epc_postcode"].str.lower().isin(postcodes["postcode"].str.lower())] # Remove any social transactions matches = matches[~matches["TENURE"].isin(["Rented (social)", "rental (social)"])] + +matches.head() +owners_count = matches.groupby(['Proprietor Name (1)', 'Company Registration No. (1)']).size().reset_index() +owners_count.columns = ['Owner', 'Owner Registration #', 'Count'] +owners_count = owners_count.sort_values('Count', ascending=False) +owners_count["Percentage"] = owners_count["Count"] / owners_count["Count"].sum() * 100 + +companies_house_api_key = "1d9c2877-3271-4642-80ed-a6170971653f" + +import requests +import json + +company_number = "13197205" +url = f'https://api.company-information.service.gov.uk/company/{company_number}' + +# Make the API request +response = requests.get(url, auth=(companies_house_api_key, '')) + +# Check if the request was successful +if response.status_code == 200: + company_data = response.json() + # Pretty-print the fetched data + print(json.dumps(company_data, indent=4)) +else: + print(f"Failed to fetch data. 
Status code: {response.status_code}") + +psc_url = f'https://api.company-information.service.gov.uk/company/{company_number}/persons-with-significant-control' +psc_response = requests.get(psc_url, auth=(companies_house_api_key, '')) +psc_data = psc_response.json() diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py index 2079391c..52181452 100644 --- a/etl/ownership/Ownership.py +++ b/etl/ownership/Ownership.py @@ -488,11 +488,11 @@ class Ownership: house_no = house_no.replace(",", "") if house_no is None: - # It's hard for us to get a reliable match - # filtered = filtered[filtered["Property Address"].str.contains(address["ADDRESS1"])] - # if filtered.shape[0] > 1: - # raise Exception("No valid - maybe we should do levenstein?") - continue + # If the house number is missing, we usually have a named property, so we keep the match only when + # exactly one row's Property Address contains that name (case-insensitive); otherwise we skip it + filtered = filtered[filtered["Property Address"].str.lower().str.contains(address["ADDRESS"].lower())] + if filtered.shape[0] != 1: + continue else: From f53ce8b4302482ce54785e9da807c6b6ad9296b3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 10 Oct 2024 18:51:12 +0100 Subject: [PATCH 3/5] allow postcode filtering --- etl/customers/gla/proposal_investigation.py | 25 ++++++++++++--------- etl/ownership/Ownership.py | 8 ++++++- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/etl/customers/gla/proposal_investigation.py b/etl/customers/gla/proposal_investigation.py index 57df0554..776bbc59 100644 --- a/etl/customers/gla/proposal_investigation.py +++ b/etl/customers/gla/proposal_investigation.py @@ -1,9 +1,12 @@ """ This script performs some basic analysis to identify EPC data for postcodes specified in the Warmer Homes Local Grant """ -from nis import match +import inspect +import requests +import json import pandas as pd +from pathlib import Path from etl.ownership.Ownership import Ownership postcodes = pd.read_excel( @@ -19,7 +22,7 @@ postcodes.columns 
= ['postcode', 'Local Authority'] postcodes = postcodes.drop([0, 1]) # Since there are a large number of potcodes (425k), let's just take a few examples # Take postcodes that begin with "BN15" -postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")] +# postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")] # The Local Authority is Adur, so let's get the EPC data for this area # epc_data = pd.read_csv( @@ -39,11 +42,14 @@ postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")] # ratings_distribution["Percentage"] = ratings_distribution["Count"] / ratings_distribution["Count"].sum() * 100 # Can we identify the owners of these units so we can contact them? + +file_src = inspect.getfile(lambda x: None) +DATA_DIRECTORY = Path(file_src).parent / "local_data" / "all-domestic-certificates" +epc_paths = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] +epc_paths = [str(entry / "certificates.csv") for entry in epc_paths] + ownership = Ownership( - epc_paths=[ - "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223" - "-Adur/certificates.csv" - ], + epc_paths=epc_paths, domestic_ownership_path="/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_07.csv", overseas_ownership_path="/Users/khalimconn-kowlessar/Downloads/OCOD_FULL_2024_07.csv", land_registry_path="/Users/khalimconn-kowlessar/Downloads/pp-complete.csv", @@ -53,11 +59,11 @@ ownership = Ownership( portfolio_value=0, excluded_owners=[], excluded_uprns=[], - save=False + save=True ) # Data will be found at ownership/gla-proposal -ownership.source_epc_properties(column_filters={}) +ownership.source_epc_properties(column_filters={}, postcodes=postcodes["postcode"].str.lower().tolist()) # Step 2: Get company ownership data ownership.load_company_ownership() @@ -83,9 +89,6 @@ owners_count["Percentage"] = owners_count["Count"] / owners_count["Count"].sum() companies_house_api_key = 
"1d9c2877-3271-4642-80ed-a6170971653f" -import requests -import json - company_number = "13197205" url = f'https://api.company-information.service.gov.uk/company/{company_number}' diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py index 52181452..2c04ac8a 100644 --- a/etl/ownership/Ownership.py +++ b/etl/ownership/Ownership.py @@ -175,7 +175,7 @@ class Ownership: # Prepare the final outputs: self.create_final_matches() - def source_epc_properties(self, column_filters=None): + def source_epc_properties(self, column_filters=None, postcodes=None): """ This function will filter the epc data as specified by column filters, searching across all of the EPC tables :param column_filters: Dictionary with column names as keys and list of acceptable values as values. This @@ -183,6 +183,7 @@ class Ownership: {"column_name": ["value1", "value2", ...]}, where column_name is the name of the column in the EPC data and ["value1", "value2", ...] is a list of acceptable values for that column. If a column is not found in the EPC data, an exception is raised. + :param postcodes: A list of postcodes to filter the data on """ column_filters = {} if column_filters is None else column_filters @@ -206,6 +207,11 @@ class Ownership: else: raise Exception(f"Column {column} not found in data. 
column_filters is malformed") + if postcodes is not None: + epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(postcodes)] + if epc_data.empty: + continue + data.append(epc_data) self.epc_data = pd.concat(data, ignore_index=True) From 722a3dba55271454e8482c42494baa66572dec29 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 11 Oct 2024 10:16:48 +0100 Subject: [PATCH 4/5] working on gla proposal --- etl/customers/gla/proposal_investigation.py | 50 +++++++++++++++--- etl/ownership/Ownership.py | 56 +++++++++++---------- 2 files changed, 72 insertions(+), 34 deletions(-) diff --git a/etl/customers/gla/proposal_investigation.py b/etl/customers/gla/proposal_investigation.py index 776bbc59..05df6be7 100644 --- a/etl/customers/gla/proposal_investigation.py +++ b/etl/customers/gla/proposal_investigation.py @@ -10,16 +10,18 @@ from pathlib import Path from etl.ownership.Ownership import Ownership postcodes = pd.read_excel( - "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx", sheet_name='Eligible postcodes' + "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes_RP edit.xlsx", sheet_name='Eligible postcodes' ) -# Take just the first two columns +# Take just the first three columns postcodes = postcodes[ - ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1'] + ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1', 'Unnamed: 2'] ] -postcodes.columns = ['postcode', 'Local Authority'] +postcodes.columns = ['postcode', 'Local Authority', 'London Borough?'] # Drop the first row postcodes = postcodes.drop([0, 1]) +# Take just the London Boroughs +postcodes = postcodes[postcodes["London Borough?"] == "Yes"] # Since there are a large number of potcodes (425k), let's just take a few examples # Take postcodes that begin with "BN15" # postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")] @@ -74,22 +76,46 @@ ownership.prepare_for_matching() # Step 4: Match EPC data to 
ownership data ownership.match() +from utils.s3 import save_excel_to_s3 + +# Save the data to S3 +save_excel_to_s3( + df=ownership.matched_addresses, + bucket_name=ownership.bucket, + file_key=ownership.matched_addresses_pre_filter_filepath +) + # We have the matches, which we now need to match to the postcodes matches = ownership.matched_addresses.copy() # filter matches on the postcodes we're interested in matches = matches[matches["epc_postcode"].str.lower().isin(postcodes["postcode"].str.lower())] # Remove any social transactions -matches = matches[~matches["TENURE"].isin(["Rented (social)", "rental (social)"])] +matches = matches[~matches["TENURE"].isin( + ["Rented (social)", "rental (social)", + "Not defined - use in the case of a new dwelling for which the intended tenure in not known. It is not to be " + "used for an existing dwelling", "NO DATA!"]) +] +# Look at the EPC ratings +epc_ratings = matches.groupby(["CURRENT_ENERGY_RATING"]).size().reset_index() +epc_ratings.columns = ["EPC Rating", "Count"] +epc_ratings["Percentage"] = epc_ratings["Count"] / epc_ratings["Count"].sum() * 100 + +# Take properties that are below an EPC C rating, as defined by the guidance and remove any new builds +matches = matches[matches["CURRENT_ENERGY_RATING"].isin(["D", "E", "F", "G"])] +# 11,694 properties -matches.head() owners_count = matches.groupby(['Proprietor Name (1)', 'Company Registration No. (1)']).size().reset_index() owners_count.columns = ['Owner', 'Owner Registration #', 'Count'] owners_count = owners_count.sort_values('Count', ascending=False) owners_count["Percentage"] = owners_count["Count"] / owners_count["Count"].sum() * 100 +# Take an example postal region +matches = matches.sort_values("epc_postcode", ascending=True) +example = matches[matches["epc_postcode"].str.startswith("BR1 ")].copy() + companies_house_api_key = "1d9c2877-3271-4642-80ed-a6170971653f" -company_number = "13197205" +company_number = example.head(1)["Company Registration No. 
(1)"].values[0] url = f'https://api.company-information.service.gov.uk/company/{company_number}' # Make the API request @@ -102,7 +128,17 @@ if response.status_code == 200: print(json.dumps(company_data, indent=4)) else: print(f"Failed to fetch data. Status code: {response.status_code}") + # Try prepending a zero to the company number + company_number = f"0{company_number}" + url = f'https://api.company-information.service.gov.uk/company/{company_number}' + response = requests.get(url, auth=(companies_house_api_key, '')) + company_data = response.json() + +from pprint import pprint + +pprint(company_data) psc_url = f'https://api.company-information.service.gov.uk/company/{company_number}/persons-with-significant-control' psc_response = requests.get(psc_url, auth=(companies_house_api_key, '')) psc_data = psc_response.json() +pprint(psc_data) diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py index 2c04ac8a..68dee9ed 100644 --- a/etl/ownership/Ownership.py +++ b/etl/ownership/Ownership.py @@ -161,16 +161,17 @@ class Ownership: # Step 5: Match land registry data to existing matches self.match_with_land_registry() # We store this data in s3 before we perform any filtering - save_excel_to_s3( - df=self.matched_addresses, - bucket_name=self.bucket, - file_key=self.matched_addresses_pre_filter_filepath - ) - save_excel_to_s3( - df=self.combined_matching_lookup, - bucket_name=self.bucket, - file_key=self.combined_matching_lookup_pre_filter_filepath - ) + if self.save: + save_excel_to_s3( + df=self.matched_addresses, + bucket_name=self.bucket, + file_key=self.matched_addresses_pre_filter_filepath + ) + save_excel_to_s3( + df=self.combined_matching_lookup, + bucket_name=self.bucket, + file_key=self.combined_matching_lookup_pre_filter_filepath + ) # Prepare the final outputs: self.create_final_matches() @@ -1013,25 +1014,26 @@ class Ownership: if self.portfolio_properties["UPRN"].nunique() != self.portfolio_epc_data["UPRN"].nunique(): raise 
ValueError("Portfolio properties and epc data don't match") - logger.info("Storing final outpus") - # Store data - save_excel_to_s3( - df=self.portfolio_owners, - bucket_name=self.bucket, - file_key=self.portfolio_owners_filepath, - ) + if self.save: + logger.info("Storing final outpus") + # Store data + save_excel_to_s3( + df=self.portfolio_owners, + bucket_name=self.bucket, + file_key=self.portfolio_owners_filepath, + ) - save_excel_to_s3( - df=self.portfolio_properties, - bucket_name=self.bucket, - file_key=self.portfolio_properties_filepath, - ) + save_excel_to_s3( + df=self.portfolio_properties, + bucket_name=self.bucket, + file_key=self.portfolio_properties_filepath, + ) - save_excel_to_s3( - df=self.portfolio_epc_data, - bucket_name=self.bucket, - file_key=self.portfolio_epc_data_filepath, - ) + save_excel_to_s3( + df=self.portfolio_epc_data, + bucket_name=self.bucket, + file_key=self.portfolio_epc_data_filepath, + ) def get_asset_list(self): """ From dadbb0ef61d4cb402029158ffb0acad3cec2ad22 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 16 Oct 2024 10:19:35 +0100 Subject: [PATCH 5/5] Finished GLA proposal --- backend/Property.py | 2 +- backend/app/plan/router.py | 6 +++- backend/app/plan/schemas.py | 3 ++ backend/ml_models/Valuation.py | 2 ++ etl/customers/gla/example_model_outputs.py | 38 ++++++++++++++++++++ etl/customers/gla/proposal_investigation.py | 39 ++++++++++++++++++--- 6 files changed, 83 insertions(+), 7 deletions(-) create mode 100644 etl/customers/gla/example_model_outputs.py diff --git a/backend/Property.py b/backend/Property.py index ab8930c5..79108dc1 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -1204,7 +1204,7 @@ class Property: return False suitable_house = self.data["property-type"] == "House" and self.data["built-form"] in [ - "Detached", "Semi-Detached", + "Detached", "Semi-Detached", "End-Terrace", ] suitable_bungalow = self.data["property-type"] == "Bungalow" and self.data["built-form"] in [ diff 
--git a/backend/app/plan/router.py b/backend/app/plan/router.py index f4924c71..3b91a461 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -543,7 +543,11 @@ async def trigger_plan(body: PlanTriggerRequest): representative_recommendations = {} for p in tqdm(input_properties): recommender = Recommendations( - property_instance=p, materials=materials, exclusions=body.exclusions, inclusions=body.inclusions + property_instance=p, + materials=materials, + exclusions=body.exclusions, + inclusions=body.inclusions, + default_u_values=body.default_u_values ) property_recommendations, property_representative_recommendations = recommender.recommend() diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py index 0d58c7e9..4b43db80 100644 --- a/backend/app/plan/schemas.py +++ b/backend/app/plan/schemas.py @@ -89,6 +89,9 @@ class PlanTriggerRequest(BaseModel): # if False, allows optimisation to be switched off optimise: Optional[bool] = True + # If True, uses default u-values for models + default_u_values: Optional[bool] = True + _allowed_goals = {"Increasing EPC"} _allowed_housing_types = {"Social", "Private"} diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index cbcebb9f..68432577 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -103,6 +103,8 @@ class PropertyValuation: # Vander Elliot Intrusive surveys 12103116: 1_537_000, 12103117: 1_404_000, + # GLA Proposal + 100020606627: 409_000 } # We base our valuation uplifts on a number of sources diff --git a/etl/customers/gla/example_model_outputs.py b/etl/customers/gla/example_model_outputs.py new file mode 100644 index 00000000..e239c43d --- /dev/null +++ b/etl/customers/gla/example_model_outputs.py @@ -0,0 +1,38 @@ +import pandas as pd +from utils.s3 import save_csv_to_s3 + +asset_list = [ + { + "address": "4, King Henrys Drive", + "postcode": "CR0 0PA" + }, +] +portfolio_id = 110 +user_id = 8 + +asset_list = 
pd.DataFrame(asset_list) + +filename = f"{user_id}/{portfolio_id}/asset_list.csv" +save_csv_to_s3( + dataframe=asset_list, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename +) + +body1 = { + "portfolio_id": str(portfolio_id), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "A", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": "", + "inclusions": [ + "cavity_wall_insulation", "loft_insulation", "air_source_heat_pump", "solar_pv" + ], + "budget": None, + "scenario_name": "Whole House", + "multi_plan": False, +} +print(body1) diff --git a/etl/customers/gla/proposal_investigation.py b/etl/customers/gla/proposal_investigation.py index 05df6be7..f6a87af1 100644 --- a/etl/customers/gla/proposal_investigation.py +++ b/etl/customers/gla/proposal_investigation.py @@ -76,13 +76,20 @@ ownership.prepare_for_matching() # Step 4: Match EPC data to ownership data ownership.match() -from utils.s3 import save_excel_to_s3 +from utils.s3 import save_excel_to_s3, read_excel_from_s3 # Save the data to S3 -save_excel_to_s3( - df=ownership.matched_addresses, +# save_excel_to_s3( +# df=ownership.matched_addresses, +# bucket_name=ownership.bucket, +# file_key=ownership.matched_addresses_pre_filter_filepath +# ) + +# Read in matches +matches = read_excel_from_s3( bucket_name=ownership.bucket, - file_key=ownership.matched_addresses_pre_filter_filepath + file_key="ownership/gla-proposal/2024-10-10 19:02:34.131365/matched_addresses_pre_filter.xlsx", + header_row=0 ) # We have the matches, which we now need to match to the postcodes @@ -95,6 +102,7 @@ matches = matches[~matches["TENURE"].isin( "Not defined - use in the case of a new dwelling for which the intended tenure in not known. 
It is not to be " "used for an existing dwelling", "NO DATA!"]) ] +matches["is_prs"] = matches["TENURE"].isin(["rental (private)", "Rented (private)"]) # Look at the EPC ratings epc_ratings = matches.groupby(["CURRENT_ENERGY_RATING"]).size().reset_index() epc_ratings.columns = ["EPC Rating", "Count"] @@ -103,6 +111,8 @@ epc_ratings["Percentage"] = epc_ratings["Count"] / epc_ratings["Count"].sum() * # Take properties that are below an EPC C rating, as defined by the guidance and remove any new builds matches = matches[matches["CURRENT_ENERGY_RATING"].isin(["D", "E", "F", "G"])] # 11,694 properties +matches["epc_postcode"].nunique() +# 6899 owners_count = matches.groupby(['Proprietor Name (1)', 'Company Registration No. (1)']).size().reset_index() owners_count.columns = ['Owner', 'Owner Registration #', 'Count'] @@ -111,7 +121,26 @@ owners_count["Percentage"] = owners_count["Count"] / owners_count["Count"].sum() # Take an example postal region matches = matches.sort_values("epc_postcode", ascending=True) -example = matches[matches["epc_postcode"].str.startswith("BR1 ")].copy() +# BR1, BR5 +example = matches[matches["epc_postcode"].str.startswith("CR0 ")].copy() +example = example[example["TENURE"].isin(["rental (private)", "Rented (private)"])] + +pd.set_option('display.max_rows', 500) +pd.set_option('display.max_columns', 500) +pd.set_option('display.width', 1000) +example[ + ["epc_address", "epc_postcode", "CURRENT_ENERGY_RATING", "CURRENT_ENERGY_EFFICIENCY", "Proprietor Name (1)", + "Company Registration No. (1)"] +].head(4) + +ownership.epc_data["UPRN"] = ownership.epc_data["UPRN"].astype(int) +example = example.merge( + ownership.epc_data[["UPRN", "BUILT_FORM", "PROPERTY_TYPE", "WALLS_DESCRIPTION", "ROOF_DESCRIPTION"]], + on="UPRN", + how="left" +) +z = example[example["CURRENT_ENERGY_RATING"] == "E"] +z = z[z["TENURE"].isin(["rental (private)", "Rented (private)"])] companies_house_api_key = "1d9c2877-3271-4642-80ed-a6170971653f"