From 722a3dba55271454e8482c42494baa66572dec29 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 11 Oct 2024 10:16:48 +0100 Subject: [PATCH] working on gla proposal --- etl/customers/gla/proposal_investigation.py | 50 +++++++++++++++--- etl/ownership/Ownership.py | 56 +++++++++++---------- 2 files changed, 72 insertions(+), 34 deletions(-) diff --git a/etl/customers/gla/proposal_investigation.py b/etl/customers/gla/proposal_investigation.py index 776bbc59..05df6be7 100644 --- a/etl/customers/gla/proposal_investigation.py +++ b/etl/customers/gla/proposal_investigation.py @@ -10,16 +10,18 @@ from pathlib import Path from etl.ownership.Ownership import Ownership postcodes = pd.read_excel( - "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx", sheet_name='Eligible postcodes' + "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes_RP edit.xlsx", sheet_name='Eligible postcodes' ) -# Take just the first two columns +# Take just the first three columns postcodes = postcodes[ - ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1'] + ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1', 'Unnamed: 2'] ] -postcodes.columns = ['postcode', 'Local Authority'] +postcodes.columns = ['postcode', 'Local Authority', 'London Borough?'] # Drop the first row postcodes = postcodes.drop([0, 1]) +# Take just the London Boroughs +postcodes = postcodes[postcodes["London Borough?"] == "Yes"] # Since there are a large number of potcodes (425k), let's just take a few examples # Take postcodes that begin with "BN15" # postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")] @@ -74,22 +76,46 @@ ownership.prepare_for_matching() # Step 4: Match EPC data to ownership data ownership.match() +from utils.s3 import save_excel_to_s3 + +# Save the data to S3 +save_excel_to_s3( + df=ownership.matched_addresses, + bucket_name=ownership.bucket, + file_key=ownership.matched_addresses_pre_filter_filepath 
+) + # We have the matches, which we now need to match to the postcodes matches = ownership.matched_addresses.copy() # filter matches on the postcodes we're interested in matches = matches[matches["epc_postcode"].str.lower().isin(postcodes["postcode"].str.lower())] # Remove any social transactions -matches = matches[~matches["TENURE"].isin(["Rented (social)", "rental (social)"])] +matches = matches[~matches["TENURE"].isin( + ["Rented (social)", "rental (social)", + "Not defined - use in the case of a new dwelling for which the intended tenure in not known. It is not to be " + "used for an existing dwelling", "NO DATA!"]) +] +# Look at the EPC ratings +epc_ratings = matches.groupby(["CURRENT_ENERGY_RATING"]).size().reset_index() +epc_ratings.columns = ["EPC Rating", "Count"] +epc_ratings["Percentage"] = epc_ratings["Count"] / epc_ratings["Count"].sum() * 100 + +# Take properties that are below an EPC C rating, as defined by the guidance and remove any new builds +matches = matches[matches["CURRENT_ENERGY_RATING"].isin(["D", "E", "F", "G"])] +# 11,694 properties -matches.head() owners_count = matches.groupby(['Proprietor Name (1)', 'Company Registration No. (1)']).size().reset_index() owners_count.columns = ['Owner', 'Owner Registration #', 'Count'] owners_count = owners_count.sort_values('Count', ascending=False) owners_count["Percentage"] = owners_count["Count"] / owners_count["Count"].sum() * 100 +# Take an example postal region +matches = matches.sort_values("epc_postcode", ascending=True) +example = matches[matches["epc_postcode"].str.startswith("BR1 ")].copy() + companies_house_api_key = "1d9c2877-3271-4642-80ed-a6170971653f" -company_number = "13197205" +company_number = example.head(1)["Company Registration No. (1)"].values[0] url = f'https://api.company-information.service.gov.uk/company/{company_number}' # Make the API request @@ -102,7 +128,17 @@ if response.status_code == 200: print(json.dumps(company_data, indent=4)) else: print(f"Failed to fetch data. 
Status code: {response.status_code}") + # Try prepending a zero to the company number + company_number = f"0{company_number}" + url = f'https://api.company-information.service.gov.uk/company/{company_number}' + response = requests.get(url, auth=(companies_house_api_key, '')) + company_data = response.json() + +from pprint import pprint + +pprint(company_data) psc_url = f'https://api.company-information.service.gov.uk/company/{company_number}/persons-with-significant-control' psc_response = requests.get(psc_url, auth=(companies_house_api_key, '')) psc_data = psc_response.json() +pprint(psc_data) diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py index 2c04ac8a..68dee9ed 100644 --- a/etl/ownership/Ownership.py +++ b/etl/ownership/Ownership.py @@ -161,16 +161,17 @@ class Ownership: # Step 5: Match land registry data to existing matches self.match_with_land_registry() # We store this data in s3 before we perform any filtering - save_excel_to_s3( - df=self.matched_addresses, - bucket_name=self.bucket, - file_key=self.matched_addresses_pre_filter_filepath - ) - save_excel_to_s3( - df=self.combined_matching_lookup, - bucket_name=self.bucket, - file_key=self.combined_matching_lookup_pre_filter_filepath - ) + if self.save: + save_excel_to_s3( + df=self.matched_addresses, + bucket_name=self.bucket, + file_key=self.matched_addresses_pre_filter_filepath + ) + save_excel_to_s3( + df=self.combined_matching_lookup, + bucket_name=self.bucket, + file_key=self.combined_matching_lookup_pre_filter_filepath + ) # Prepare the final outputs: self.create_final_matches() @@ -1013,25 +1014,26 @@ class Ownership: if self.portfolio_properties["UPRN"].nunique() != self.portfolio_epc_data["UPRN"].nunique(): raise ValueError("Portfolio properties and epc data don't match") - logger.info("Storing final outpus") - # Store data - save_excel_to_s3( - df=self.portfolio_owners, - bucket_name=self.bucket, - file_key=self.portfolio_owners_filepath, - ) + if self.save: + 
logger.info("Storing final outpus") + # Store data + save_excel_to_s3( + df=self.portfolio_owners, + bucket_name=self.bucket, + file_key=self.portfolio_owners_filepath, + ) - save_excel_to_s3( - df=self.portfolio_properties, - bucket_name=self.bucket, - file_key=self.portfolio_properties_filepath, - ) + save_excel_to_s3( + df=self.portfolio_properties, + bucket_name=self.bucket, + file_key=self.portfolio_properties_filepath, + ) - save_excel_to_s3( - df=self.portfolio_epc_data, - bucket_name=self.bucket, - file_key=self.portfolio_epc_data_filepath, - ) + save_excel_to_s3( + df=self.portfolio_epc_data, + bucket_name=self.bucket, + file_key=self.portfolio_epc_data_filepath, + ) def get_asset_list(self): """