# Model/etl/customers/gla/proposal_investigation.py

"""
This script performs some basic analysis to identify EPC data for postcodes specified in the Warmer Homes Local Grant
"""

import inspect
import requests
import json
import pandas as pd
from pathlib import Path
from etl.ownership.Ownership import Ownership

# Load the 'Eligible postcodes' sheet of the WHLG eligibility workbook
postcodes = pd.read_excel(
    "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes_RP edit.xlsx",
    sheet_name='Eligible postcodes',
)

# Keep only the three columns we need and give them short, readable names
postcodes = postcodes.loc[
    :, ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1', 'Unnamed: 2']
].rename(
    columns={
        'List of eligible postcodes via the IMD Income Decile 1-2 pathway': 'postcode',
        'Unnamed: 1': 'Local Authority',
        'Unnamed: 2': 'London Borough?',
    }
)

# Drop the rows labelled 0 and 1 (presumably leftover header rows from the sheet - confirm against the workbook)
postcodes = postcodes.drop(index=[0, 1])

# Keep only the postcodes flagged as belonging to a London Borough
postcodes = postcodes.loc[postcodes["London Borough?"].eq("Yes")]
# Since there are a large number of postcodes (425k), let's just take a few examples
# Take postcodes that begin with "BN15"
# postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")]

# The Local Authority is Adur, so let's get the EPC data for this area
# epc_data = pd.read_csv(
#     "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223-Adur"
#     "/certificates.csv", low_memory=False
# )
# # Filter on these postcodes
# epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(postcodes["postcode"].str.lower())]
# epc_data = epc_data[~pd.isnull(epc_data["UPRN"])]
# # Take the newest EPC for each UPRN, based on LODGEMENT_DATE
# epc_data["LODGEMENT_DATE"] = pd.to_datetime(epc_data["LODGEMENT_DATE"])
# epc_data = epc_data.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN")
#
# # Let's look at the breakdown of EPC ratings. We want the count and the % of the total
# ratings_distribution = epc_data.groupby("CURRENT_ENERGY_RATING").size().reset_index()
# ratings_distribution.columns = ["Rating", "Count"]
# ratings_distribution["Percentage"] = ratings_distribution["Count"] / ratings_distribution["Count"].sum() * 100

# Can we identify the owners of these units so we can contact them?

# inspect.getfile on a throwaway lambda reports the file in which this code was defined, which lets us build a path
# relative to it.
# NOTE(review): this resolves "local_data" next to wherever this code is defined/run from - confirm that is where the
# EPC download actually lives
file_src = inspect.getfile(lambda x: None)
DATA_DIRECTORY = Path(file_src).parent.joinpath("local_data", "all-domestic-certificates")

# Every sub-directory of the download is expected to hold a certificates.csv; collect those paths as strings
epc_paths = [
    str(subdirectory / "certificates.csv")
    for subdirectory in DATA_DIRECTORY.iterdir()
    if subdirectory.is_dir()
]

# Configuration for the ownership-matching run.
# NOTE(review): the zero values and empty exclusion lists presumably mean "no value-based filtering / no exclusions" -
# confirm against the Ownership class
ownership_config = {
    "epc_paths": epc_paths,
    "domestic_ownership_path": "/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_07.csv",
    "overseas_ownership_path": "/Users/khalimconn-kowlessar/Downloads/OCOD_FULL_2024_07.csv",
    "land_registry_path": "/Users/khalimconn-kowlessar/Downloads/pp-complete.csv",
    "project_name": "gla-proposal",
    "bucket": "retrofit-data-dev",
    "average_property_value": 0,
    "portfolio_value": 0,
    "excluded_owners": [],
    "excluded_uprns": [],
    "save": True,
}
ownership = Ownership(**ownership_config)

# Step 1: Source the EPC properties for the eligible postcodes (lower-cased), with no extra column filters
# Data will be found at ownership/gla-proposal
target_postcodes = postcodes["postcode"].str.lower().tolist()
ownership.source_epc_properties(column_filters={}, postcodes=target_postcodes)

# Step 2: Get company ownership data
ownership.load_company_ownership()

# Step 3: Prepare data for matching
ownership.prepare_for_matching()

# Step 4: Match EPC data to ownership data
ownership.match()

from utils.s3 import save_excel_to_s3, read_excel_from_s3

# Save the data to S3
# save_excel_to_s3(
#     df=ownership.matched_addresses,
#     bucket_name=ownership.bucket,
#     file_key=ownership.matched_addresses_pre_filter_filepath
# )

# Start from the matches produced by the pipeline run above. To inspect a previously saved run instead (without
# re-running the pipeline), load its snapshot from S3 and skip the assignment below:
# matches = read_excel_from_s3(
#     bucket_name=ownership.bucket,
#     file_key="ownership/gla-proposal/2024-10-10 19:02:34.131365/matched_addresses_pre_filter.xlsx",
#     header_row=0
# )
matches = ownership.matched_addresses.copy()

# Tenure labels that are excluded from the analysis: social rentals and records with no usable tenure
excluded_tenures = [
    "Rented (social)", "rental (social)",
    "Not defined - use in the case of a new dwelling for which the intended tenure in not known. It is not to be "
    "used for an existing dwelling", "NO DATA!"
]
# Tenure labels that identify the private rented sector (both spellings are checked)
private_rental_tenures = ["rental (private)", "Rented (private)"]

# Filter matches on the postcodes we're interested in (case-insensitive comparison)
matches = matches[matches["epc_postcode"].str.lower().isin(postcodes["postcode"].str.lower())]
# Remove any social tenures. Take an explicit copy so that adding a column below writes to an independent frame
# rather than to a filtered slice (which triggers pandas' SettingWithCopyWarning)
matches = matches[~matches["TENURE"].isin(excluded_tenures)].copy()
matches["is_prs"] = matches["TENURE"].isin(private_rental_tenures)

# Look at the EPC ratings: count of properties per rating and each rating's share of the total (in %)
epc_ratings = matches.groupby(["CURRENT_ENERGY_RATING"]).size().reset_index()
epc_ratings.columns = ["EPC Rating", "Count"]
epc_ratings["Percentage"] = epc_ratings["Count"] / epc_ratings["Count"].sum() * 100

# Keep only properties rated below EPC C (i.e. D to G), as defined by the guidance.
# NOTE(review): removing new builds was also intended at this step, but no such filter is applied yet
is_below_c = matches["CURRENT_ENERGY_RATING"].isin(["D", "E", "F", "G"])
matches = matches[is_below_c]
# 11,694 properties
matches["epc_postcode"].nunique()
# 6899

# Count the remaining properties per owner (proprietor name + company registration number), largest owners first.
# NOTE(review): groupby drops rows where either key is missing by default, so owners without a registration number
# are presumably not counted here - confirm that is intended
owner_keys = ['Proprietor Name (1)', 'Company Registration No. (1)']
owners_count = matches.groupby(owner_keys).size().reset_index()
owners_count.columns = ['Owner', 'Owner Registration #', 'Count']
owners_count = owners_count.sort_values(by='Count', ascending=False)

# Each owner's share of the counted properties, in %
total_counted = owners_count["Count"].sum()
owners_count["Percentage"] = owners_count["Count"].div(total_counted).mul(100)

# Take an example postal region: sort by postcode, then keep the privately rented properties in "CR0 "
matches = matches.sort_values(by="epc_postcode")
# BR1, BR5
in_example_region = matches["epc_postcode"].str.startswith("CR0 ")
example = matches.loc[in_example_region].copy()
example = example.loc[example["TENURE"].isin(["rental (private)", "Rented (private)"])]

# Widen the pandas display limits so that wide frames are readable when inspected interactively
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# Preview of the key columns (only displays anything when run interactively)
preview_columns = [
    "epc_address", "epc_postcode", "CURRENT_ENERGY_RATING", "CURRENT_ENERGY_EFFICIENCY", "Proprietor Name (1)",
    "Company Registration No. (1)",
]
example[preview_columns].head(4)

# Attach building fabric details from the EPC data, joining on UPRN.
# NOTE(review): astype(int) raises if any UPRN is missing, and it modifies ownership.epc_data in place - confirm that
# the EPC data held by the Ownership object never contains null UPRNs
ownership.epc_data["UPRN"] = ownership.epc_data["UPRN"].astype(int)
fabric_columns = ["UPRN", "BUILT_FORM", "PROPERTY_TYPE", "WALLS_DESCRIPTION", "ROOF_DESCRIPTION"]
example = pd.merge(example, ownership.epc_data[fabric_columns], on="UPRN", how="left")

# Narrow down to the E-rated private rentals (the tenure filter repeats the one already applied to `example`)
z = example.loc[example["CURRENT_ENERGY_RATING"] == "E"]
z = z.loc[z["TENURE"].isin(["rental (private)", "Rented (private)"])]

import os
from pprint import pprint

# Companies House public data API. Requests authenticate with HTTP Basic auth: the API key is sent as the username
# and the password is left empty.
COMPANIES_HOUSE_BASE_URL = "https://api.company-information.service.gov.uk"
# Never wait indefinitely on the remote service
REQUEST_TIMEOUT_SECONDS = 30

# The API key is a secret, so it is read from the environment rather than being stored in source control
companies_house_api_key = os.environ.get("COMPANIES_HOUSE_API_KEY")
if not companies_house_api_key:
    raise RuntimeError("Set the COMPANIES_HOUSE_API_KEY environment variable before querying Companies House")


def _candidate_company_numbers(raw_number):
    """Return the company-number spellings to try, in order, for a value read from the matched data.

    The value is tried as-is first. A purely numeric value shorter than 8 characters is then also tried zero-padded
    to 8 characters, because leading zeros are lost whenever the column has been parsed as a number.

    :param raw_number: Company registration number as read from the matched data (str, int or float)
    :return: List of distinct candidate strings, in the order they should be tried
    :raises ValueError: If the value is missing (NaN/None) or blank
    """
    if pd.isna(raw_number):
        raise ValueError("The selected property has no company registration number")
    # A float such as 1234567.0 must not end up in the URL as "1234567.0"
    if isinstance(raw_number, float) and raw_number.is_integer():
        raw_number = int(raw_number)
    text = str(raw_number).strip()
    if not text:
        raise ValueError("The selected property has a blank company registration number")

    candidates = [text]
    # Only pad purely numeric values: prefixed numbers would be corrupted by a leading zero
    if text.isdigit() and len(text) < 8:
        candidates.append(text.zfill(8))
    return candidates


def _companies_house_get(path):
    """Send an authenticated GET request for *path* (e.g. ``/company/01234567``) and return the response."""
    return requests.get(
        f"{COMPANIES_HOUSE_BASE_URL}{path}",
        auth=(companies_house_api_key, ''),
        timeout=REQUEST_TIMEOUT_SECONDS,
    )


# Look up the company that owns the first example property
raw_company_number = example.head(1)["Company Registration No. (1)"].values[0]

company_number = None
company_data = None
for candidate in _candidate_company_numbers(raw_company_number):
    response = _companies_house_get(f"/company/{candidate}")
    if response.status_code == 200:
        # Remember the spelling that worked: it is reused for the follow-up request below
        company_number = candidate
        company_data = response.json()
        break
    print(f"Failed to fetch data. Status code: {response.status_code}")

if company_data is None:
    raise RuntimeError(f"Could not fetch a company profile for registration number {raw_company_number!r}")

pprint(company_data)

# Fetch the persons with significant control for the same company
psc_response = _companies_house_get(f"/company/{company_number}/persons-with-significant-control")
if psc_response.status_code == 200:
    psc_data = psc_response.json()
    pprint(psc_data)
else:
    # An error response may not carry a JSON body, so report the status instead of trying to parse it
    psc_data = None
    print(f"Failed to fetch persons with significant control. Status code: {psc_response.status_code}")