wave 3 applications closed

2026-07-27 23:35:01 +00:00 · 2024-11-27 10:16:06 +00:00 · 2024-11-27 10:16:06 +00:00 · fff8f50f69
commit fff8f50f69
parent f6612c0cd4
7 changed files with 191 additions and 11 deletions
--- a/etl/customers/cottons/parse_pdf_asset_list.py
+++ b/etl/customers/cottons/parse_pdf_asset_list.py
@ -0,0 +1,64 @@
+import re
+import pandas as pd
+from PyPDF2 import PdfReader
+
+# Paths to the uploaded files
+file_paths = [
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged).pdf",
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 2.pdf",
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 3.pdf",
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 4.pdf",
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 5.pdf",
+    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 6.pdf"
+]
+
+
+# Function to extract text from PDFs
+def extract_text_from_pdf_with_pypdf2(file_path):
+    text = ""
+    reader = PdfReader(file_path)
+    for page in reader.pages:
+        text += page.extract_text()
+    return text
+
+
+# Initialize a list to hold all parsed data
+all_parsed_data = []
+
+# Process each PDF individually
+for i, path in enumerate(file_paths):
+    # Extract text from the PDF
+    extracted_text = extract_text_from_pdf_with_pypdf2(path)
+
+    # Step 1: Remove titles and repeated headers
+    cleaned_text = re.sub(r"Managed Property Report as at \d+ \w+ \d+", "", extracted_text)
+    cleaned_text = re.sub(r"Code Property Address Management Type", "", cleaned_text)
+
+    # Step 2: Extract rows ending with "Managed"
+    rows = re.findall(r".*?Managed", cleaned_text)
+
+    # Step 3: Parse rows into structured data
+    parsed_data = []
+    for row in rows:
+        match = re.match(r"(\S+)\s+(.+?)\s+Managed", row.strip())
+        if match:
+            code = match.group(1).strip()
+            address = match.group(2).strip()
+            parsed_data.append((code, address, "Managed"))
+
+    # Append parsed data to the global list
+    all_parsed_data.extend(parsed_data)
+
+    # Provide feedback for debugging
+    print(f"File {i + 1} processed: {len(parsed_data)} rows")
+
+# Step 4: Create a unified DataFrame
+final_df = pd.DataFrame(all_parsed_data, columns=["Code", "Property Address", "Management Type"])
+
+# Step 5: Save the unified DataFrame to an Excel file
+final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx"
+final_df.to_excel(final_output_file_path, index=False)
+
+# Provide feedback
+print(f"All files processed and combined. Total rows: {len(final_df)}")
+print(f"Unified file saved to: {final_output_file_path}")
--- a/etl/customers/cottons/prep_asset_list.py
+++ b/etl/customers/cottons/prep_asset_list.py
@ -0,0 +1,15 @@
+import pandas as pd
+
+df = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx"
+)
+
+# Split the address on commas: the first section is address1, the last section is the postcode
+df["address1"] = df["Property Address"].apply(lambda x: x.split(",")[0].strip())
+df["postcode"] = df["Property Address"].apply(lambda x: x.split(",")[-1].strip())
+
+# Re-save
+df.to_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx",
+    index=False,
+)
--- a/etl/customers/gla/hug_postcodes.py
+++ b/etl/customers/gla/hug_postcodes.py
@ -0,0 +1,46 @@
+import inspect
+import pandas as pd
+from pathlib import Path
+from tqdm import tqdm
+from etl.epc.settings import EARLIEST_EPC_DATE
+
+src_file_path = inspect.getfile(lambda: None)
+
+EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates")
+epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
+
+aggregation = []
+for directory in tqdm(epc_directories):
+    data = pd.read_csv(directory / "certificates.csv", low_memory=False)
+    # Rename the columns to the same format as the api returns
+    data.columns = [c.replace("_", "-").lower() for c in data.columns]
+
+    data = data[data["posttown"].str.contains("London", case=False, na=False)]
+    if data.empty:
+        continue
+    # Keep only EPCs lodged on or after the earliest-date threshold
+    data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
+
+    data = data[~pd.isnull(data["uprn"])]
+    # Take just the newest EPC per uprn, based on lodgement-date
+    data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
+    # Take EPC D and below
+    data = data[data["current-energy-rating"].isin(["D", "E", "F", "G"])]
+    data["postal_region"] = data["postcode"].str.split(" ").str[0]
+
+    # Take homes whose main fuel is not mains gas (rows with a missing main-fuel value are kept)
+    off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)]
+
+    region_summary = off_gas.groupby("postal_region").size().reset_index(name="count")
+
+    aggregation.append(region_summary)
+
+postal_region_aggregation = pd.concat(aggregation)
+postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False)
+postal_region_aggregation = postal_region_aggregation.rename(
+    columns={"postal_region": "Postcode Region", "count": "Number of Homes"}
+)
+postal_region_aggregation.to_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions.xlsx",
+    index=False
+)
--- a/etl/customers/ksquared/Wave3
+++ b/etl/customers/ksquared/Wave3
@ -305,7 +305,7 @@ def caha():

    # Get conservation area data
    uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]]
-    conservation_area_data = OpenUprnClient.get_spatial_data([100022526362], "retrofit-data-dev")
+    conservation_area_data = OpenUprnClient.get_spatial_data([36284], "retrofit-data-dev")

    addresses = pd.DataFrame(asset_list)
    addresses["uprn"] = addresses["uprn"].astype(str)
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -2591,5 +2591,21 @@ def propsed_wave_3_sample():
        os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False
    )

+    survey_results = pd.read_excel(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"),
+        header=13,
+        sheet_name="Modelled Packages"
+    )
+
+    indivual_units = pd.read_csv(
+        os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv")
+    )
+
+    u_aids = survey_results["Archetype ID"].astype(str).unique()
+    units_in_bid = indivual_units[indivual_units['Unit in Programme']]["Archetype ID"].astype(str).values
+
+    len({v for v in units_in_bid if str(v) in u_aids})
+    len(list(set(units_in_bid)))
+
 # if __name__ == "__main__":
 #     main()
--- a/etl/customers/stonewater/potential_eco_properties.py
+++ b/etl/customers/stonewater/potential_eco_properties.py
@ -375,3 +375,41 @@ def app():
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv",
        index=False
    )
+
+
+def cross_reference_epc_programme():
+    eco3_fallout = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE "
+        "SURVEYED - ECO3 NOT COMPLETED.xlsx"
+    )
+
+    eco3_fallout["house_number"] = eco3_fallout.apply(
+        lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1
+    )
+
+    # for _, x in eco3_fallout.ite
+
+    stonewater_modelled_above_c = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+        "master sheet.csv",
+        encoding='latin1'
+    )
+
+    stonewater_modelled_above_c["house_number"] = stonewater_modelled_above_c.apply(
+        lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), axis=1
+    )
+
+    eco3_fallout_matched_to_above_c = []
+    for _, property in eco3_fallout.iterrows():
+        # Match on house number
+        match = stonewater_modelled_above_c[
+            stonewater_modelled_above_c["house_number"] == property["house_number"]
+            ]
+
+        # We do a fuzzy match on the address, using Levenshtein distance
+
+        from fuzzywuzzy import fuzz
+        match = stonewater_modelled_above_c[
+            stonewater_modelled_above_c["Address"].apply(lambda x: fuzz.ratio(x, property["ADDRESS"]) > 90)
+        ]
+        match.head()
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@ -120,17 +120,17 @@ def app():
    Property UPRN

    """
-    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
-    DATA_FILENAME = "Bromford programme review.xlsx"
-    SHEET_NAME = "Bromford"
-    POSTCODE_COLUMN = "Postcode"
-    FULLADDRESS_COLUMN = None
-    ADDRESS1_COLUMN = "No."
-    ADDRESS1_METHOD = "first_two_words"
-    ADDRESS_COLS_TO_CONCAT = ["No.", "Address"]
+    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/"
+    DATA_FILENAME = "Cottons Asset List.xlsx"
+    SHEET_NAME = "Sheet1"
+    POSTCODE_COLUMN = "postcode"
+    FULLADDRESS_COLUMN = "Property Address"
+    ADDRESS1_COLUMN = "address1"
+    ADDRESS1_METHOD = None
+    ADDRESS_COLS_TO_CONCAT = []

    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
-    asset_list = asset_list[~pd.isnull(asset_list["Postcode"])]
+    # asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()
    asset_list["row_id"] = asset_list.index

    # We clean up portential non-breaking spaces, and double spaces
@ -202,7 +202,8 @@ def app():

    transformed_df = pd.DataFrame(transformed_data)
    # Drop the column that is ""
-    transformed_df = transformed_df.drop(columns=[""])
+    if "" in transformed_df.columns:
+        transformed_df = transformed_df.drop(columns=[""])

    # Get the find my epc data
    find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(