From 722a3dba55271454e8482c42494baa66572dec29 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 11 Oct 2024 10:16:48 +0100
Subject: [PATCH] working on gla proposal

---
 etl/customers/gla/proposal_investigation.py | 50 +++++++++++++++---
 etl/ownership/Ownership.py                  | 56 +++++++++++----------
 2 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/etl/customers/gla/proposal_investigation.py b/etl/customers/gla/proposal_investigation.py
index 776bbc59..05df6be7 100644
--- a/etl/customers/gla/proposal_investigation.py
+++ b/etl/customers/gla/proposal_investigation.py
@@ -10,16 +10,18 @@ from pathlib import Path
 from etl.ownership.Ownership import Ownership
 
 postcodes = pd.read_excel(
-    "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx", sheet_name='Eligible postcodes'
+    "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes_RP edit.xlsx", sheet_name='Eligible postcodes'
 )
-# Take just the first two columns
+# Take just the first three columns
 postcodes = postcodes[
-    ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1']
+    ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1', 'Unnamed: 2']
 ]
 
-postcodes.columns = ['postcode', 'Local Authority']
+postcodes.columns = ['postcode', 'Local Authority', 'London Borough?']
 # Drop the first row
 postcodes = postcodes.drop([0, 1])
+# Take just the London Boroughs
+postcodes = postcodes[postcodes["London Borough?"] == "Yes"]
 # Since there are a large number of potcodes (425k), let's just take a few examples
 # Take postcodes that begin with "BN15"
 # postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")]
@@ -74,22 +76,46 @@ ownership.prepare_for_matching()
 # Step 4: Match EPC data to ownership data
 ownership.match()
 
+from utils.s3 import save_excel_to_s3
+
+# Save the data to S3
+save_excel_to_s3(
+    df=ownership.matched_addresses,
+    bucket_name=ownership.bucket,
+    file_key=ownership.matched_addresses_pre_filter_filepath
+)
+
 # We have the matches, which we now need to match to the postcodes
 matches = ownership.matched_addresses.copy()
 # filter matches on the postcodes we're interested in
 matches = matches[matches["epc_postcode"].str.lower().isin(postcodes["postcode"].str.lower())]
 # Remove any social transactions
-matches = matches[~matches["TENURE"].isin(["Rented (social)", "rental (social)"])]
+matches = matches[~matches["TENURE"].isin(
+    ["Rented (social)", "rental (social)",
+     "Not defined - use in the case of a new dwelling for which the intended tenure in not known. It is not to be "
+     "used for an existing dwelling", "NO DATA!"])
+]
+# Look at the EPC ratings
+epc_ratings = matches.groupby(["CURRENT_ENERGY_RATING"]).size().reset_index()
+epc_ratings.columns = ["EPC Rating", "Count"]
+epc_ratings["Percentage"] = epc_ratings["Count"] / epc_ratings["Count"].sum() * 100
+
+# Take properties that are below an EPC C rating (i.e. D-G), as defined by the guidance, and remove any new builds
+matches = matches[matches["CURRENT_ENERGY_RATING"].isin(["D", "E", "F", "G"])]
+# 11,694 properties
 
-matches.head()
 owners_count = matches.groupby(['Proprietor Name (1)', 'Company Registration No. (1)']).size().reset_index()
 owners_count.columns = ['Owner', 'Owner Registration #', 'Count']
 owners_count = owners_count.sort_values('Count', ascending=False)
 owners_count["Percentage"] = owners_count["Count"] / owners_count["Count"].sum() * 100
 
+# Take an example postal region
+matches = matches.sort_values("epc_postcode", ascending=True)
+example = matches[matches["epc_postcode"].str.startswith("BR1 ")].copy()
+
 companies_house_api_key = "1d9c2877-3271-4642-80ed-a6170971653f"
 
-company_number = "13197205"
+company_number = example.head(1)["Company Registration No. (1)"].values[0]
 url = f'https://api.company-information.service.gov.uk/company/{company_number}'
 
 # Make the API request
@@ -102,7 +128,17 @@ if response.status_code == 200:
     print(json.dumps(company_data, indent=4))
 else:
     print(f"Failed to fetch data. Status code: {response.status_code}")
+    # Try prepending a zero to the company number
+    company_number = f"0{company_number}"
+    url = f'https://api.company-information.service.gov.uk/company/{company_number}'
+    response = requests.get(url, auth=(companies_house_api_key, ''))
+    company_data = response.json()
+
+from pprint import pprint
+
+pprint(company_data)
 
 psc_url = f'https://api.company-information.service.gov.uk/company/{company_number}/persons-with-significant-control'
 psc_response = requests.get(psc_url, auth=(companies_house_api_key, ''))
 psc_data = psc_response.json()
+pprint(psc_data)
diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py
index 2c04ac8a..68dee9ed 100644
--- a/etl/ownership/Ownership.py
+++ b/etl/ownership/Ownership.py
@@ -161,16 +161,17 @@ class Ownership:
         # Step 5: Match land registry data to existing matches
         self.match_with_land_registry()
         # We store this data in s3 before we perform any filtering
-        save_excel_to_s3(
-            df=self.matched_addresses,
-            bucket_name=self.bucket,
-            file_key=self.matched_addresses_pre_filter_filepath
-        )
-        save_excel_to_s3(
-            df=self.combined_matching_lookup,
-            bucket_name=self.bucket,
-            file_key=self.combined_matching_lookup_pre_filter_filepath
-        )
+        if self.save:
+            save_excel_to_s3(
+                df=self.matched_addresses,
+                bucket_name=self.bucket,
+                file_key=self.matched_addresses_pre_filter_filepath
+            )
+            save_excel_to_s3(
+                df=self.combined_matching_lookup,
+                bucket_name=self.bucket,
+                file_key=self.combined_matching_lookup_pre_filter_filepath
+            )
 
         # Prepare the final outputs:
         self.create_final_matches()
@@ -1013,25 +1014,26 @@ class Ownership:
         if self.portfolio_properties["UPRN"].nunique() != self.portfolio_epc_data["UPRN"].nunique():
             raise ValueError("Portfolio properties and epc data don't match")
 
-        logger.info("Storing final outpus")
-        # Store data
-        save_excel_to_s3(
-            df=self.portfolio_owners,
-            bucket_name=self.bucket,
-            file_key=self.portfolio_owners_filepath,
-        )
+        if self.save:
+            logger.info("Storing final outpus")
+            # Store data
+            save_excel_to_s3(
+                df=self.portfolio_owners,
+                bucket_name=self.bucket,
+                file_key=self.portfolio_owners_filepath,
+            )
 
-        save_excel_to_s3(
-            df=self.portfolio_properties,
-            bucket_name=self.bucket,
-            file_key=self.portfolio_properties_filepath,
-        )
+            save_excel_to_s3(
+                df=self.portfolio_properties,
+                bucket_name=self.bucket,
+                file_key=self.portfolio_properties_filepath,
+            )
 
-        save_excel_to_s3(
-            df=self.portfolio_epc_data,
-            bucket_name=self.bucket,
-            file_key=self.portfolio_epc_data_filepath,
-        )
+            save_excel_to_s3(
+                df=self.portfolio_epc_data,
+                bucket_name=self.bucket,
+                file_key=self.portfolio_epc_data_filepath,
+            )
 
     def get_asset_list(self):
         """