adding in the basic structure of the extraction code

2026-06-08 11:17:27 +00:00 · 2024-10-28 10:57:26 +00:00 · 2024-10-28 10:57:26 +00:00 · 7513e475d3
commit 7513e475d3
parent 8325f1bf7a
4 changed files with 95 additions and 2 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyNamespacePackagesService">
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -3,7 +3,7 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
  <component name="PyCharmProfessionalAdvertiser">
    <option name="shown" value="true" />
  </component>
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -0,0 +1,92 @@
+import os
+import PyPDF2
+import re
+
+# Root folder that main() scans; each immediate sub-directory is treated as one survey folder.
+# NOTE(review): this is an absolute path on a developer machine - confirm/override before running elsewhere.
+FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
+
+
+def extract_summary_report(pdf_path):
+    """
+    Extracts specific data from the provided PDF file.
+
+    The text of every page is concatenated and searched with regular expressions for three labelled values:
+    - Current SAP rating  (a capital letter, a space and a number, e.g. "C 69")
+    - Fuel Bill           (whole pounds, thousands separators allowed, e.g. "£1,234")
+    - Emissions (t/year)  (a number followed by the word "tonnes", e.g. "2.5 tonnes")
+
+    :param pdf_path: path of the PDF file to read
+    :return: dict keyed by the three labels above; a value is None when its label was not found in the text
+    """
+    data = {
+        "Current SAP rating": None,
+        "Fuel Bill": None,
+        "Emissions (t/year)": None,
+    }
+
+    # The file handle is only needed while the pages are read, so the parsing below happens after it is closed.
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        # `or ""` keeps the join safe should a page yield no text at all
+        text = "".join(page.extract_text() or "" for page in reader.pages)
+
+    # Extract Current SAP rating
+    sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
+    if sap_match:
+        data["Current SAP rating"] = sap_match.group(1)
+
+    # Extract Fuel Bill. Comma-grouped thousands are accepted so that "£1,234" is not cut short to "£1";
+    # any pence after a decimal point are still ignored.
+    fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+(?:,\d{3})*)", text)
+    if fuel_bill_match:
+        data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
+
+    # Extract Emissions
+    emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text)
+    if emissions_match:
+        data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes"
+
+    return data
+
+
+def main():
+    """
+    This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
+
+    For every survey folder directly under FILE_PATH it looks for a subfolder whose name contains
+    "retrofit assessment" and, inside that, a summary report PDF. The values returned by
+    extract_summary_report are collected together with the survey folder name.
+
+    Survey folders without a retrofit assessment subfolder, or with an empty one, are skipped.
+
+    :return: list of dicts, one per survey folder that had a summary report
+    :raises NotImplementedError: when a retrofit assessment folder has files but no summary report PDF -
+        handling of that case has not been written yet
+    """
+    # List only directories in the specified FILE_PATH
+    survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
+
+    extracted_data = []
+    for survey_folder in survey_folders:
+        survey_path = os.path.join(FILE_PATH, survey_folder)
+
+        # List the folders inside of the survey folder
+        survey_subfolders = [
+            name for name in os.listdir(survey_path) if os.path.isdir(os.path.join(survey_path, name))
+        ]
+
+        # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment".
+        # If it exists, we will use the data from that folder
+        retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
+        if retrofit_folder is None:
+            # Covers "no subfolders at all" as well as "no matching subfolder"; os.path.join would raise
+            # TypeError if handed None
+            continue
+
+        # List contents of the retrofit folder
+        retrofit_path = os.path.join(survey_path, retrofit_folder)
+        retrofit_files = os.listdir(retrofit_path)
+        if not retrofit_files:
+            continue
+
+        # We now look for specific files:
+        # 1) Check the summary report - the file name contains the word "summary" and the extension is .pdf
+        # (both compared case-insensitively)
+        summary_report = next(
+            (name for name in retrofit_files if "summary" in name.lower() and name.lower().endswith(".pdf")), None
+        )
+        if summary_report is not None:
+            summary_data = extract_summary_report(os.path.join(retrofit_path, summary_report))
+            extracted_data.append({"survey_folder": survey_folder, **summary_data})
+            continue
+
+        # Placeholder: retrofit folders without a summary report are not handled yet
+        raise NotImplementedError("IMPLEMENT ME!")
+
+    return extracted_data
+
+
+# Run the extraction only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@ -0,0 +1 @@
+PyPDF2