From fff8f50f69cad56ffe353bdf2ab0aa6f2d12573e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 27 Nov 2024 10:16:06 +0000
Subject: [PATCH] wave 3 applications closed

---
 etl/customers/cottons/parse_pdf_asset_list.py | 64 +++++++++++++++++++
 etl/customers/cottons/prep_asset_list.py      | 15 +++++
 etl/customers/gla/hug_postcodes.py            | 46 +++++++++++++
 etl/customers/ksquared/Wave3 Modelling.py     |  2 +-
 .../stonewater/Wave 3 Preparation.py          | 16 +++++
 .../stonewater/potential_eco_properties.py    | 38 +++++++++++
 etl/route_march_data_pull/app.py              | 21 +++---
 7 files changed, 191 insertions(+), 11 deletions(-)
 create mode 100644 etl/customers/cottons/parse_pdf_asset_list.py
 create mode 100644 etl/customers/cottons/prep_asset_list.py
 create mode 100644 etl/customers/gla/hug_postcodes.py

diff --git a/etl/customers/cottons/parse_pdf_asset_list.py b/etl/customers/cottons/parse_pdf_asset_list.py
new file mode 100644
index 00000000..7d442e97
--- /dev/null
+++ b/etl/customers/cottons/parse_pdf_asset_list.py
@@ -0,0 +1,64 @@
+import re
+import pandas as pd
+from PyPDF2 import PdfReader
+
+# Paths to the uploaded files
+file_paths = [
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged).pdf",
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 2.pdf",
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 3.pdf",
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 4.pdf",
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 5.pdf",
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 6.pdf"
+]
+
+
+# Function to extract text from PDFs
+def extract_text_from_pdf_with_pypdf2(file_path):
+    """Return the extracted text of every page of the PDF at ``file_path``, concatenated in page order."""
+    pdf = PdfReader(file_path)
+    # Gather the per-page text first, then stitch it together in a single join
+    page_texts = [page.extract_text() for page in pdf.pages]
+    return "".join(page_texts)
+
+
+# Initialize a list to hold all parsed data
+all_parsed_data = []
+
+# Process each PDF individually
+for i, path in enumerate(file_paths):
+    # Pull the raw text out of this PDF
+    raw_text = extract_text_from_pdf_with_pypdf2(path)
+
+    # Step 1: Strip the report title and the repeated column-header text
+    without_title = re.sub(r"Managed Property Report as at \d+ \w+ \d+", "", raw_text)
+    body_text = re.sub(r"Code Property Address Management Type", "", without_title)
+
+    # Step 2: Grab each shortest run of text that ends with "Managed"
+    candidate_rows = re.findall(r".*?Managed", body_text)
+
+    # Step 3: Split each candidate into code and address; candidates that do not fit are dropped
+    field_matches = (
+        re.match(r"(\S+)\s+(.+?)\s+Managed", candidate.strip()) for candidate in candidate_rows
+    )
+    parsed_data = [
+        (found.group(1).strip(), found.group(2).strip(), "Managed")
+        for found in field_matches if found
+    ]
+
+    # Accumulate this file's rows into the combined list
+    all_parsed_data += parsed_data
+
+    # Per-file row count, handy when debugging the parse
+    print(f"File {i + 1} processed: {len(parsed_data)} rows")
+
+# Step 4: Create a unified DataFrame
+final_df = pd.DataFrame(all_parsed_data, columns=["Code", "Property Address", "Management Type"])
+
+# Step 5: Save the unified DataFrame to an Excel file
+final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx"
+final_df.to_excel(final_output_file_path, index=False)
+
+# Provide feedback
+print(f"All files processed and combined. Total rows: {len(final_df)}")
+print(f"Unified file saved to: {final_output_file_path}")
diff --git a/etl/customers/cottons/prep_asset_list.py b/etl/customers/cottons/prep_asset_list.py
new file mode 100644
index 00000000..db7c6583
--- /dev/null
+++ b/etl/customers/cottons/prep_asset_list.py
@@ -0,0 +1,15 @@
+import pandas as pd
+
+df = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx"
+)
+
+# Split each address on commas: first section is address1, last section is the postcode (missing cells stay NaN)
+df["address1"] = df["Property Address"].str.split(",").str[0].str.strip()
+df["postcode"] = df["Property Address"].str.split(",").str[-1].str.strip()
+
+# Re-save
+df.to_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx",
+    index=False,
+)
diff --git a/etl/customers/gla/hug_postcodes.py b/etl/customers/gla/hug_postcodes.py
new file mode 100644
index 00000000..85783d62
--- /dev/null
+++ b/etl/customers/gla/hug_postcodes.py
@@ -0,0 +1,46 @@
+import inspect
+import pandas as pd
+from pathlib import Path
+from tqdm import tqdm
+from etl.epc.settings import EARLIEST_EPC_DATE
+
+src_file_path = inspect.getfile(lambda: None)
+
+EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates")
+epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
+
+aggregation = []
+for directory in tqdm(epc_directories):
+    data = pd.read_csv(directory / "certificates.csv", low_memory=False)
+    # Rename the columns to the same format as the api returns
+    data.columns = [c.replace("_", "-").lower() for c in data.columns]
+
+    data = data[data["posttown"].str.contains("London", case=False, na=False)]
+    if data.empty:
+        continue
+    # Keep only certificates lodged on or after the earliest-date threshold
+    data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
+
+    data = data[~pd.isnull(data["uprn"])]
+    # Take just the newest EPC per uprn, based on lodgement-date
+    data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
+    # Take EPC D and below
+    data = data[data["current-energy-rating"].isin(["D", "E", "F", "G"])]
+    data["postal_region"] = data["postcode"].str.split(" ").str[0]
+
+    # Take homes whose main fuel is not mains gas
+    off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)]
+
+    region_summary = off_gas.groupby("postal_region").size().reset_index(name="count")
+
+    aggregation.append(region_summary)
+
+# The same postal region may appear in more than one directory's summary, so sum its counts into a single row
+postal_region_aggregation = pd.concat(aggregation).groupby("postal_region", as_index=False)["count"].sum()
+postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False).rename(
+    columns={"postal_region": "Postcode Region", "count": "Number of Homes"}
+)
+postal_region_aggregation.to_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions.xlsx",
+    index=False
+)
diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py
index 7bfa33b3..0bf6eb18 100644
--- a/etl/customers/ksquared/Wave3 Modelling.py	
+++ b/etl/customers/ksquared/Wave3 Modelling.py	
@@ -305,7 +305,7 @@ def caha():
 
     # Get conservation area data
     uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]]
-    conservation_area_data = OpenUprnClient.get_spatial_data([100022526362], "retrofit-data-dev")
+    conservation_area_data = OpenUprnClient.get_spatial_data([36284], "retrofit-data-dev")
 
     addresses = pd.DataFrame(asset_list)
     addresses["uprn"] = addresses["uprn"].astype(str)
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index b6c29863..77200e69 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2591,5 +2591,21 @@ def propsed_wave_3_sample():
         os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False
     )
 
+    survey_results = pd.read_excel(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"),
+        header=13,
+        sheet_name="Modelled Packages"
+    )
+
+    indivual_units = pd.read_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv")
+    )
+
+    u_aids = survey_results["Archetype ID"].astype(str).unique()
+    units_in_bid = indivual_units[indivual_units['Unit in Programme']]["Archetype ID"].astype(str).values
+
+    len({v for v in units_in_bid if str(v) in u_aids})
+    len(list(set(units_in_bid)))
+
 # if __name__ == "__main__":
 #     main()
diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py
index 4fb89113..c0301e9a 100644
--- a/etl/customers/stonewater/potential_eco_properties.py
+++ b/etl/customers/stonewater/potential_eco_properties.py
@@ -375,3 +375,41 @@ def app():
         "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv",
         index=False
     )
+
+
+def cross_reference_epc_programme():
+    """WIP: look up each ECO3 "not completed" address in the Osmosis-reviewed Parity master sheet; returns nothing."""
+    eco3_fallout = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE "
+        "SURVEYED - ECO3 NOT COMPLETED.xlsx"
+    )
+
+    eco3_fallout["house_number"] = eco3_fallout.apply(
+        lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1
+    )
+
+    stonewater_modelled_above_c = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+        "master sheet.csv",
+        encoding='latin1'
+    )
+
+    stonewater_modelled_above_c["house_number"] = stonewater_modelled_above_c.apply(
+        lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), axis=1
+    )
+
+    # NOTE(review): nothing is appended to this list yet, so no matches are collected
+    eco3_fallout_matched_to_above_c = []
+    for _, property in eco3_fallout.iterrows():
+        # Rows sharing this house number. NOTE(review): overwritten by the fuzzy match below, so unused
+        match = stonewater_modelled_above_c[
+            stonewater_modelled_above_c["house_number"] == property["house_number"]
+            ]
+
+        # Fuzzy match on the address: keep rows whose fuzz.ratio score (Levenshtein-based) is above 90
+
+        from fuzzywuzzy import fuzz
+        match = stonewater_modelled_above_c[
+            stonewater_modelled_above_c["Address"].apply(lambda x: fuzz.ratio(x, property["ADDRESS"]) > 90)
+        ]
+        match.head()
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
index 6f9dd135..b53b36c2 100644
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@@ -120,17 +120,17 @@ def app():
     Property UPRN
 
     """
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
-    DATA_FILENAME = "Bromford programme review.xlsx"
-    SHEET_NAME = "Bromford"
-    POSTCODE_COLUMN = "Postcode"
-    FULLADDRESS_COLUMN = None
-    ADDRESS1_COLUMN = "No."
-    ADDRESS1_METHOD = "first_two_words"
-    ADDRESS_COLS_TO_CONCAT = ["No.", "Address"]
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/"
+    DATA_FILENAME = "Cottons Asset List.xlsx"
+    SHEET_NAME = "Sheet1"
+    POSTCODE_COLUMN = "postcode"
+    FULLADDRESS_COLUMN = "Property Address"
+    ADDRESS1_COLUMN = "address1"
+    ADDRESS1_METHOD = None
+    ADDRESS_COLS_TO_CONCAT = []
 
     asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
-    asset_list = asset_list[~pd.isnull(asset_list["Postcode"])]
+    # asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()
     asset_list["row_id"] = asset_list.index
 
     # We clean up portential non-breaking spaces, and double spaces
@@ -202,7 +202,8 @@ def app():
 
     transformed_df = pd.DataFrame(transformed_data)
     # Drop the column that is ""
-    transformed_df = transformed_df.drop(columns=[""])
+    if "" in transformed_df.columns:
+        transformed_df = transformed_df.drop(columns=[""])
 
     # Get the find my epc data
     find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(